virt/kvm/kvm_main.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <avi@qumranet.com>
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  */
  15
  16 #include <kvm/iodev.h>
  17
  18 #include <linux/kvm_host.h>
  19 #include <linux/kvm.h>
  20 #include <linux/module.h>
  21 #include <linux/errno.h>
  22 #include <linux/percpu.h>
  23 #include <linux/mm.h>
  24 #include <linux/miscdevice.h>
  25 #include <linux/vmalloc.h>
  26 #include <linux/reboot.h>
  27 #include <linux/debugfs.h>
  28 #include <linux/highmem.h>
  29 #include <linux/file.h>
  30 #include <linux/syscore_ops.h>
  31 #include <linux/cpu.h>
  32 #include <linux/sched/signal.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/sched/stat.h>
  35 #include <linux/cpumask.h>
  36 #include <linux/smp.h>
  37 #include <linux/anon_inodes.h>
  38 #include <linux/profile.h>
  39 #include <linux/kvm_para.h>
  40 #include <linux/pagemap.h>
  41 #include <linux/mman.h>
  42 #include <linux/swap.h>
  43 #include <linux/bitops.h>
  44 #include <linux/spinlock.h>
  45 #include <linux/compat.h>
  46 #include <linux/srcu.h>
  47 #include <linux/hugetlb.h>
  48 #include <linux/slab.h>
  49 #include <linux/sort.h>
  50 #include <linux/bsearch.h>
  51 #include <linux/io.h>
  52 #include <linux/lockdep.h>
  53 #include <linux/kthread.h>
  54 #include <linux/suspend.h>
  55
  56 #include <asm/processor.h>
  57 #include <asm/ioctl.h>
  58 #include <linux/uaccess.h>
  59
  60 #include "coalesced_mmio.h"
  61 #include "async_pf.h"
  62 #include "kvm_mm.h"
  63 #include "vfio.h"
  64
  65 #include <trace/events/ipi.h>
  66
  67 #define CREATE_TRACE_POINTS
  68 #include <trace/events/kvm.h>
  69
  70 #include <linux/kvm_dirty_ring.h>
  71
  72
/*
 * Worst case buffer size needed for holding an integer, e.g. a 32-bit value
 * printed in decimal needs at most 11 characters ("-2147483648") plus the
 * terminating NUL.
 */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default (zero, as there is no initializer) resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  98
/*
 * Ordering of locks:
 *
 *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

/* Slab cache that vCPU structures are returned to in kvm_vcpu_destroy(). */
static struct kmem_cache *kvm_vcpu_cache;

/* Preempt notifier callbacks attached to each vCPU in kvm_vcpu_init(). */
static __read_mostly struct preempt_ops kvm_preempt_ops;
/* Per-CPU pointer to the vCPU set by vcpu_load() and cleared by vcpu_put(). */
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static struct file_operations kvm_chardev_ops;
 119
 120 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 121                            unsigned long arg);
 122 #ifdef CONFIG_KVM_COMPAT
 123 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 124                                   unsigned long arg);
 125 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
 126 #else
 127 /*
 128  * For architectures that don't implement a compat infrastructure,
 129  * adopt a double line of defense:
 130  * - Prevent a compat task from opening /dev/kvm
 131  * - If the open has been done by a 64bit task, and the KVM fd
 132  *   passed to a compat task, let the ioctls fail.
 133  */
 134 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 135                                 unsigned long arg) { return -EINVAL; }
 136
 137 static int kvm_no_compat_open(struct inode *inode, struct file *file)
 138 {
 139         return is_compat_task() ? -ENODEV : 0;
 140 }
 141 #define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
 142                         .open           = kvm_no_compat_open
 143 #endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

/*
 * NOTE(review): presumably the @type values accepted by
 * kvm_uevent_notify_change() declared below -- confirm at the call sites.
 */
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

/*
 * Per-CPU scratch cpumask in which kvm_make_vcpus_request_mask() and
 * kvm_make_all_cpus_request_except() collect the CPUs that need an IPI.
 */
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 156
/*
 * Default (weak) no-op implementation; an architecture can provide its own
 * definition to override it.  Called from kvm_flush_shadow_all() and used as
 * the on_unlock callback of the invalidate_range_start() notifier.
 */
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}
 160
 161 bool kvm_is_zone_device_page(struct page *page)
 162 {
 163         /*
 164          * The metadata used by is_zone_device_page() to determine whether or
 165          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
 166          * the device has been pinned, e.g. by get_user_pages().  WARN if the
 167          * page_count() is zero to help detect bad usage of this helper.
 168          */
 169         if (WARN_ON_ONCE(!page_count(page)))
 170                 return false;
 171
 172         return is_zone_device_page(page);
 173 }
 174
 175 /*
 176  * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 177  * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 178  * is likely incomplete, it has been compiled purely through people wanting to
 179  * back guest with a certain type of memory and encountering issues.
 180  */
 181 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
 182 {
 183         struct page *page;
 184
 185         if (!pfn_valid(pfn))
 186                 return NULL;
 187
 188         page = pfn_to_page(pfn);
 189         if (!PageReserved(page))
 190                 return page;
 191
 192         /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
 193         if (is_zero_pfn(pfn))
 194                 return page;
 195
 196         /*
 197          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 198          * perspective they are "normal" pages, albeit with slightly different
 199          * usage rules.
 200          */
 201         if (kvm_is_zone_device_page(page))
 202                 return page;
 203
 204         return NULL;
 205 }
 206
 207 /*
 208  * Switches to specified vcpu, until a matching vcpu_put()
 209  */
 210 void vcpu_load(struct kvm_vcpu *vcpu)
 211 {
 212         int cpu = get_cpu();
 213
 214         __this_cpu_write(kvm_running_vcpu, vcpu);
 215         preempt_notifier_register(&vcpu->preempt_notifier);
 216         kvm_arch_vcpu_load(vcpu, cpu);
 217         put_cpu();
 218 }
 219 EXPORT_SYMBOL_GPL(vcpu_load);
 220
 221 void vcpu_put(struct kvm_vcpu *vcpu)
 222 {
 223         preempt_disable();
 224         kvm_arch_vcpu_put(vcpu);
 225         preempt_notifier_unregister(&vcpu->preempt_notifier);
 226         __this_cpu_write(kvm_running_vcpu, NULL);
 227         preempt_enable();
 228 }
 229 EXPORT_SYMBOL_GPL(vcpu_put);
 230
 231 /* TODO: merge with kvm_arch_vcpu_should_kick */
 232 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 233 {
 234         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 235
 236         /*
 237          * We need to wait for the VCPU to reenable interrupts and get out of
 238          * READING_SHADOW_PAGE_TABLES mode.
 239          */
 240         if (req & KVM_REQUEST_WAIT)
 241                 return mode != OUTSIDE_GUEST_MODE;
 242
 243         /*
 244          * Need to kick a running VCPU, but otherwise there is nothing to do.
 245          */
 246         return mode == IN_GUEST_MODE;
 247 }
 248
/*
 * Intentionally empty callback handed to smp_call_function_many() by
 * kvm_kick_many_cpus(); the argument is unused.
 */
static void ack_kick(void *_completed)
{
}
 252
 253 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 254 {
 255         if (cpumask_empty(cpus))
 256                 return false;
 257
 258         smp_call_function_many(cpus, ack_kick, NULL, wait);
 259         return true;
 260 }
 261
 262 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
 263                                   struct cpumask *tmp, int current_cpu)
 264 {
 265         int cpu;
 266
 267         if (likely(!(req & KVM_REQUEST_NO_ACTION)))
 268                 __kvm_make_request(req, vcpu);
 269
 270         if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 271                 return;
 272
 273         /*
 274          * Note, the vCPU could get migrated to a different pCPU at any point
 275          * after kvm_request_needs_ipi(), which could result in sending an IPI
 276          * to the previous pCPU.  But, that's OK because the purpose of the IPI
 277          * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
 278          * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
 279          * after this point is also OK, as the requirement is only that KVM wait
 280          * for vCPUs that were reading SPTEs _before_ any changes were
 281          * finalized. See kvm_vcpu_kick() for more details on handling requests.
 282          */
 283         if (kvm_request_needs_ipi(vcpu, req)) {
 284                 cpu = READ_ONCE(vcpu->cpu);
 285                 if (cpu != -1 && cpu != current_cpu)
 286                         __cpumask_set_cpu(cpu, tmp);
 287         }
 288 }
 289
 290 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 291                                  unsigned long *vcpu_bitmap)
 292 {
 293         struct kvm_vcpu *vcpu;
 294         struct cpumask *cpus;
 295         int i, me;
 296         bool called;
 297
 298         me = get_cpu();
 299
 300         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 301         cpumask_clear(cpus);
 302
 303         for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
 304                 vcpu = kvm_get_vcpu(kvm, i);
 305                 if (!vcpu)
 306                         continue;
 307                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 308         }
 309
 310         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 311         put_cpu();
 312
 313         return called;
 314 }
 315
 316 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 317                                       struct kvm_vcpu *except)
 318 {
 319         struct kvm_vcpu *vcpu;
 320         struct cpumask *cpus;
 321         unsigned long i;
 322         bool called;
 323         int me;
 324
 325         me = get_cpu();
 326
 327         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 328         cpumask_clear(cpus);
 329
 330         kvm_for_each_vcpu(i, vcpu, kvm) {
 331                 if (vcpu == except)
 332                         continue;
 333                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 334         }
 335
 336         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 337         put_cpu();
 338
 339         return called;
 340 }
 341
 342 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 343 {
 344         return kvm_make_all_cpus_request_except(kvm, req, NULL);
 345 }
 346 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 347
 348 void kvm_flush_remote_tlbs(struct kvm *kvm)
 349 {
 350         ++kvm->stat.generic.remote_tlb_flush_requests;
 351
 352         /*
 353          * We want to publish modifications to the page tables before reading
 354          * mode. Pairs with a memory barrier in arch-specific code.
 355          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 356          * and smp_mb in walk_shadow_page_lockless_begin/end.
 357          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 358          *
 359          * There is already an smp_mb__after_atomic() before
 360          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 361          * barrier here.
 362          */
 363         if (!kvm_arch_flush_remote_tlbs(kvm)
 364             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 365                 ++kvm->stat.generic.remote_tlb_flush;
 366 }
 367 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 368
 369 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 370 {
 371         if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
 372                 return;
 373
 374         /*
 375          * Fall back to a flushing entire TLBs if the architecture range-based
 376          * TLB invalidation is unsupported or can't be performed for whatever
 377          * reason.
 378          */
 379         kvm_flush_remote_tlbs(kvm);
 380 }
 381
 382 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
 383                                    const struct kvm_memory_slot *memslot)
 384 {
 385         /*
 386          * All current use cases for flushing the TLBs for a specific memslot
 387          * are related to dirty logging, and many do the TLB flush out of
 388          * mmu_lock. The interaction between the various operations on memslot
 389          * must be serialized by slots_locks to ensure the TLB flush from one
 390          * operation is observed by any other operation on the same memslot.
 391          */
 392         lockdep_assert_held(&kvm->slots_lock);
 393         kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
 394 }
 395
 396 static void kvm_flush_shadow_all(struct kvm *kvm)
 397 {
 398         kvm_arch_flush_shadow_all(kvm);
 399         kvm_arch_guest_memory_reclaimed(kvm);
 400 }
 401
 402 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 403 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 404                                                gfp_t gfp_flags)
 405 {
 406         gfp_flags |= mc->gfp_zero;
 407
 408         if (mc->kmem_cache)
 409                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 410         else
 411                 return (void *)__get_free_page(gfp_flags);
 412 }
 413
 414 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 415 {
 416         gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
 417         void *obj;
 418
 419         if (mc->nobjs >= min)
 420                 return 0;
 421
 422         if (unlikely(!mc->objects)) {
 423                 if (WARN_ON_ONCE(!capacity))
 424                         return -EIO;
 425
 426                 mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
 427                 if (!mc->objects)
 428                         return -ENOMEM;
 429
 430                 mc->capacity = capacity;
 431         }
 432
 433         /* It is illegal to request a different capacity across topups. */
 434         if (WARN_ON_ONCE(mc->capacity != capacity))
 435                 return -EIO;
 436
 437         while (mc->nobjs < mc->capacity) {
 438                 obj = mmu_memory_cache_alloc_obj(mc, gfp);
 439                 if (!obj)
 440                         return mc->nobjs >= min ? 0 : -ENOMEM;
 441                 mc->objects[mc->nobjs++] = obj;
 442         }
 443         return 0;
 444 }
 445
 446 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 447 {
 448         return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
 449 }
 450
 451 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 452 {
 453         return mc->nobjs;
 454 }
 455
 456 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 457 {
 458         while (mc->nobjs) {
 459                 if (mc->kmem_cache)
 460                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 461                 else
 462                         free_page((unsigned long)mc->objects[--mc->nobjs]);
 463         }
 464
 465         kvfree(mc->objects);
 466
 467         mc->objects = NULL;
 468         mc->capacity = 0;
 469 }
 470
 471 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 472 {
 473         void *p;
 474
 475         if (WARN_ON(!mc->nobjs))
 476                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 477         else
 478                 p = mc->objects[--mc->nobjs];
 479         BUG_ON(!p);
 480         return p;
 481 }
 482 #endif
 483
 484 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 485 {
 486         mutex_init(&vcpu->mutex);
 487         vcpu->cpu = -1;
 488         vcpu->kvm = kvm;
 489         vcpu->vcpu_id = id;
 490         vcpu->pid = NULL;
 491 #ifndef __KVM_HAVE_ARCH_WQP
 492         rcuwait_init(&vcpu->wait);
 493 #endif
 494         kvm_async_pf_vcpu_init(vcpu);
 495
 496         kvm_vcpu_set_in_spin_loop(vcpu, false);
 497         kvm_vcpu_set_dy_eligible(vcpu, false);
 498         vcpu->preempted = false;
 499         vcpu->ready = false;
 500         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 501         vcpu->last_used_slot = NULL;
 502
 503         /* Fill the stats id string for the vcpu */
 504         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
 505                  task_pid_nr(current), id);
 506 }
 507
 508 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 509 {
 510         kvm_arch_vcpu_destroy(vcpu);
 511         kvm_dirty_ring_free(&vcpu->dirty_ring);
 512
 513         /*
 514          * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 515          * the vcpu->pid pointer, and at destruction time all file descriptors
 516          * are already gone.
 517          */
 518         put_pid(rcu_dereference_protected(vcpu->pid, 1));
 519
 520         free_page((unsigned long)vcpu->run);
 521         kmem_cache_free(kvm_vcpu_cache, vcpu);
 522 }
 523
 524 void kvm_destroy_vcpus(struct kvm *kvm)
 525 {
 526         unsigned long i;
 527         struct kvm_vcpu *vcpu;
 528
 529         kvm_for_each_vcpu(i, vcpu, kvm) {
 530                 kvm_vcpu_destroy(vcpu);
 531                 xa_erase(&kvm->vcpu_array, i);
 532         }
 533
 534         atomic_set(&kvm->online_vcpus, 0);
 535 }
 536 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 537
 538 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 539 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 540 {
 541         return container_of(mn, struct kvm, mmu_notifier);
 542 }
 543
/*
 * Per-memslot handler invoked by __kvm_handle_hva_range() for each gfn range
 * that intersects the hva range; the boolean results are OR-ed together.
 */
typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

/*
 * Callback invoked by __kvm_handle_hva_range() right after it takes the MMU
 * lock, with the start and end of the hva range being processed.
 */
typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
                             unsigned long end);

/*
 * Callback invoked by __kvm_handle_hva_range() after it has dropped the MMU
 * lock again.
 */
typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 550
/* Description of one hva range operation for __kvm_handle_hva_range(). */
struct kvm_hva_range {
        /* Host virtual address range [start, end); end is exclusive. */
        unsigned long start;
        unsigned long end;
        /* Copied into each kvm_gfn_range handed to the handler. */
        union kvm_mmu_notifier_arg arg;
        /* Per-memslot callback, or kvm_null_fn if only on_lock is wanted. */
        hva_handler_t handler;
        /* Called once after the MMU lock is taken, or kvm_null_fn. */
        on_lock_fn_t on_lock;
        /* Called once after the MMU lock is dropped, or kvm_null_fn. */
        on_unlock_fn_t on_unlock;
        /* Flush remote TLBs if any handler invocation returned true. */
        bool flush_on_ret;
        /* Copied into each kvm_gfn_range handed to the handler. */
        bool may_block;
};
 561
/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
/* True if @fn is the kvm_null_fn stub, i.e. "no callback provided". */
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/* No initializer, so this static object is zero-initialized. */
static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
 576
 577 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
 578 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)          \
 579         for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
 580              node;                                                           \
 581              node = interval_tree_iter_next(node, start, last))      \
 582
 583 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 584                                                   const struct kvm_hva_range *range)
 585 {
 586         bool ret = false, locked = false;
 587         struct kvm_gfn_range gfn_range;
 588         struct kvm_memory_slot *slot;
 589         struct kvm_memslots *slots;
 590         int i, idx;
 591
 592         if (WARN_ON_ONCE(range->end <= range->start))
 593                 return 0;
 594
 595         /* A null handler is allowed if and only if on_lock() is provided. */
 596         if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 597                          IS_KVM_NULL_FN(range->handler)))
 598                 return 0;
 599
 600         idx = srcu_read_lock(&kvm->srcu);
 601
 602         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 603                 struct interval_tree_node *node;
 604
 605                 slots = __kvm_memslots(kvm, i);
 606                 kvm_for_each_memslot_in_hva_range(node, slots,
 607                                                   range->start, range->end - 1) {
 608                         unsigned long hva_start, hva_end;
 609
 610                         slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
 611                         hva_start = max(range->start, slot->userspace_addr);
 612                         hva_end = min(range->end, slot->userspace_addr +
 613                                                   (slot->npages << PAGE_SHIFT));
 614
 615                         /*
 616                          * To optimize for the likely case where the address
 617                          * range is covered by zero or one memslots, don't
 618                          * bother making these conditional (to avoid writes on
 619                          * the second or later invocation of the handler).
 620                          */
 621                         gfn_range.arg = range->arg;
 622                         gfn_range.may_block = range->may_block;
 623
 624                         /*
 625                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
 626                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 627                          */
 628                         gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 629                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 630                         gfn_range.slot = slot;
 631
 632                         if (!locked) {
 633                                 locked = true;
 634                                 KVM_MMU_LOCK(kvm);
 635                                 if (!IS_KVM_NULL_FN(range->on_lock))
 636                                         range->on_lock(kvm, range->start, range->end);
 637                                 if (IS_KVM_NULL_FN(range->handler))
 638                                         break;
 639                         }
 640                         ret |= range->handler(kvm, &gfn_range);
 641                 }
 642         }
 643
 644         if (range->flush_on_ret && ret)
 645                 kvm_flush_remote_tlbs(kvm);
 646
 647         if (locked) {
 648                 KVM_MMU_UNLOCK(kvm);
 649                 if (!IS_KVM_NULL_FN(range->on_unlock))
 650                         range->on_unlock(kvm);
 651         }
 652
 653         srcu_read_unlock(&kvm->srcu, idx);
 654
 655         /* The notifiers are averse to booleans. :-( */
 656         return (int)ret;
 657 }
 658
 659 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 660                                                 unsigned long start,
 661                                                 unsigned long end,
 662                                                 union kvm_mmu_notifier_arg arg,
 663                                                 hva_handler_t handler)
 664 {
 665         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 666         const struct kvm_hva_range range = {
 667                 .start          = start,
 668                 .end            = end,
 669                 .arg            = arg,
 670                 .handler        = handler,
 671                 .on_lock        = (void *)kvm_null_fn,
 672                 .on_unlock      = (void *)kvm_null_fn,
 673                 .flush_on_ret   = true,
 674                 .may_block      = false,
 675         };
 676
 677         return __kvm_handle_hva_range(kvm, &range);
 678 }
 679
 680 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 681                                                          unsigned long start,
 682                                                          unsigned long end,
 683                                                          hva_handler_t handler)
 684 {
 685         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 686         const struct kvm_hva_range range = {
 687                 .start          = start,
 688                 .end            = end,
 689                 .handler        = handler,
 690                 .on_lock        = (void *)kvm_null_fn,
 691                 .on_unlock      = (void *)kvm_null_fn,
 692                 .flush_on_ret   = false,
 693                 .may_block      = false,
 694         };
 695
 696         return __kvm_handle_hva_range(kvm, &range);
 697 }
 698
 699 static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 700 {
 701         /*
 702          * Skipping invalid memslots is correct if and only change_pte() is
 703          * surrounded by invalidate_range_{start,end}(), which is currently
 704          * guaranteed by the primary MMU.  If that ever changes, KVM needs to
 705          * unmap the memslot instead of skipping the memslot to ensure that KVM
 706          * doesn't hold references to the old PFN.
 707          */
 708         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 709
 710         if (range->slot->flags & KVM_MEMSLOT_INVALID)
 711                 return false;
 712
 713         return kvm_set_spte_gfn(kvm, range);
 714 }
 715
 716 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 717                                         struct mm_struct *mm,
 718                                         unsigned long address,
 719                                         pte_t pte)
 720 {
 721         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 722         const union kvm_mmu_notifier_arg arg = { .pte = pte };
 723
 724         trace_kvm_set_spte_hva(address);
 725
 726         /*
 727          * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 728          * If mmu_invalidate_in_progress is zero, then no in-progress
 729          * invalidations, including this one, found a relevant memslot at
 730          * start(); rechecking memslots here is unnecessary.  Note, a false
 731          * positive (count elevated by a different invalidation) is sub-optimal
 732          * but functionally ok.
 733          */
 734         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 735         if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 736                 return;
 737
 738         kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 739 }
 740
 741 void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
 742                               unsigned long end)
 743 {
 744         /*
 745          * The count increase must become visible at unlock time as no
 746          * spte can be established without taking the mmu_lock and
 747          * count is also read inside the mmu_lock critical section.
 748          */
 749         kvm->mmu_invalidate_in_progress++;
 750         if (likely(kvm->mmu_invalidate_in_progress == 1)) {
 751                 kvm->mmu_invalidate_range_start = start;
 752                 kvm->mmu_invalidate_range_end = end;
 753         } else {
 754                 /*
 755                  * Fully tracking multiple concurrent ranges has diminishing
 756                  * returns. Keep things simple and just find the minimal range
 757                  * which includes the current and new ranges. As there won't be
 758                  * enough information to subtract a range after its invalidate
 759                  * completes, any ranges invalidated concurrently will
 760                  * accumulate and persist until all outstanding invalidates
 761                  * complete.
 762                  */
 763                 kvm->mmu_invalidate_range_start =
 764                         min(kvm->mmu_invalidate_range_start, start);
 765                 kvm->mmu_invalidate_range_end =
 766                         max(kvm->mmu_invalidate_range_end, end);
 767         }
 768 }
 769
 770 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 771                                         const struct mmu_notifier_range *range)
 772 {
 773         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 774         const struct kvm_hva_range hva_range = {
 775                 .start          = range->start,
 776                 .end            = range->end,
 777                 .handler        = kvm_unmap_gfn_range,
 778                 .on_lock        = kvm_mmu_invalidate_begin,
 779                 .on_unlock      = kvm_arch_guest_memory_reclaimed,
 780                 .flush_on_ret   = true,
 781                 .may_block      = mmu_notifier_range_blockable(range),
 782         };
 783
 784         trace_kvm_unmap_hva_range(range->start, range->end);
 785
 786         /*
 787          * Prevent memslot modification between range_start() and range_end()
 788          * so that conditionally locking provides the same result in both
 789          * functions.  Without that guarantee, the mmu_invalidate_in_progress
 790          * adjustments will be imbalanced.
 791          *
 792          * Pairs with the decrement in range_end().
 793          */
 794         spin_lock(&kvm->mn_invalidate_lock);
 795         kvm->mn_active_invalidate_count++;
 796         spin_unlock(&kvm->mn_invalidate_lock);
 797
 798         /*
 799          * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
 800          * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
 801          * each cache's lock.  There are relatively few caches in existence at
 802          * any given time, and the caches themselves can check for hva overlap,
 803          * i.e. don't need to rely on memslot overlap checks for performance.
 804          * Because this runs without holding mmu_lock, the pfn caches must use
 805          * mn_active_invalidate_count (see above) instead of
 806          * mmu_invalidate_in_progress.
 807          */
 808         gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 809                                           hva_range.may_block);
 810
 811         __kvm_handle_hva_range(kvm, &hva_range);
 812
 813         return 0;
 814 }
 815
/*
 * Mark one in-progress invalidation as finished: bump the invalidation
 * sequence count, then drop mmu_invalidate_in_progress.  The order of the two
 * stores is enforced by the write barrier in between.  @start and @end are
 * not used by this function.
 */
void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
                            unsigned long end)
{
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
         * been freed.
         */
        kvm->mmu_invalidate_seq++;
        smp_wmb();
        /*
         * The above sequence increase must be visible before the
         * below count decrease, which is ensured by the smp_wmb above
         * in conjunction with the smp_rmb in mmu_invalidate_retry().
         */
        kvm->mmu_invalidate_in_progress--;
}
 833
 834 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 835                                         const struct mmu_notifier_range *range)
 836 {
 837         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 838         const struct kvm_hva_range hva_range = {
 839                 .start          = range->start,
 840                 .end            = range->end,
 841                 .handler        = (void *)kvm_null_fn,
 842                 .on_lock        = kvm_mmu_invalidate_end,
 843                 .on_unlock      = (void *)kvm_null_fn,
 844                 .flush_on_ret   = false,
 845                 .may_block      = mmu_notifier_range_blockable(range),
 846         };
 847         bool wake;
 848
 849         __kvm_handle_hva_range(kvm, &hva_range);
 850
 851         /* Pairs with the increment in range_start(). */
 852         spin_lock(&kvm->mn_invalidate_lock);
 853         wake = (--kvm->mn_active_invalidate_count == 0);
 854         spin_unlock(&kvm->mn_invalidate_lock);
 855
 856         /*
 857          * There can only be one waiter, since the wait happens under
 858          * slots_lock.
 859          */
 860         if (wake)
 861                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 862
 863         BUG_ON(kvm->mmu_invalidate_in_progress < 0);
 864 }
 865
 866 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 867                                               struct mm_struct *mm,
 868                                               unsigned long start,
 869                                               unsigned long end)
 870 {
 871         trace_kvm_age_hva(start, end);
 872
 873         return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
 874                                     kvm_age_gfn);
 875 }
 876
 877 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 878                                         struct mm_struct *mm,
 879                                         unsigned long start,
 880                                         unsigned long end)
 881 {
 882         trace_kvm_age_hva(start, end);
 883
 884         /*
 885          * Even though we do not flush TLB, this will still adversely
 886          * affect performance on pre-Haswell Intel EPT, where there is
 887          * no EPT Access Bit to clear so that we have to tear down EPT
 888          * tables instead. If we find this unacceptable, we can always
 889          * add a parameter to kvm_age_hva so that it effectively doesn't
 890          * do anything on clear_young.
 891          *
 892          * Also note that currently we never issue secondary TLB flushes
 893          * from clear_young, leaving this job up to the regular system
 894          * cadence. If we find this inaccurate, we might come up with a
 895          * more sophisticated heuristic later.
 896          */
 897         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 898 }
 899
 900 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 901                                        struct mm_struct *mm,
 902                                        unsigned long address)
 903 {
 904         trace_kvm_test_age_hva(address);
 905
 906         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 907                                              kvm_test_age_gfn);
 908 }
 909
 910 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 911                                      struct mm_struct *mm)
 912 {
 913         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 914         int idx;
 915
 916         idx = srcu_read_lock(&kvm->srcu);
 917         kvm_flush_shadow_all(kvm);
 918         srcu_read_unlock(&kvm->srcu, idx);
 919 }
 920
/*
 * MMU notifier callbacks for a VM; installed into kvm->mmu_notifier by
 * kvm_init_mmu_notifier().
 */
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
        .clear_young            = kvm_mmu_notifier_clear_young,
        .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
};
 930
 931 static int kvm_init_mmu_notifier(struct kvm *kvm)
 932 {
 933         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 934         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 935 }
 936
 937 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 938
/* No MMU notifier support in this configuration: nothing to register. */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        return 0;
}
 943
 944 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 945
 946 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 947 static int kvm_pm_notifier_call(struct notifier_block *bl,
 948                                 unsigned long state,
 949                                 void *unused)
 950 {
 951         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 952
 953         return kvm_arch_pm_notifier(kvm, state);
 954 }
 955
 956 static void kvm_init_pm_notifier(struct kvm *kvm)
 957 {
 958         kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 959         /* Suspend KVM before we suspend ftrace, RCU, etc. */
 960         kvm->pm_notifier.priority = INT_MAX;
 961         register_pm_notifier(&kvm->pm_notifier);
 962 }
 963
/* Unregister the per-VM PM notifier registered by kvm_init_pm_notifier(). */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
        unregister_pm_notifier(&kvm->pm_notifier);
}
 968 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
/* Stub for !CONFIG_HAVE_KVM_PM_NOTIFIER: there is no PM notifier to set up. */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}
 972
/* Stub for !CONFIG_HAVE_KVM_PM_NOTIFIER: there is no PM notifier to remove. */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
 976 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 977
 978 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 979 {
 980         if (!memslot->dirty_bitmap)
 981                 return;
 982
 983         kvfree(memslot->dirty_bitmap);
 984         memslot->dirty_bitmap = NULL;
 985 }
 986
/*
 * Free @slot together with its dirty bitmap and arch-specific metadata.
 *
 * This does not remove the slot from struct kvm_memslots data structures.
 */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        kvm_destroy_dirty_bitmap(slot);

        kvm_arch_free_memslot(kvm, slot);

        kfree(slot);
}
 996
 997 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 998 {
 999         struct hlist_node *idnode;
1000         struct kvm_memory_slot *memslot;
1001         int bkt;
1002
1003         /*
1004          * The same memslot objects live in both active and inactive sets,
1005          * arbitrarily free using index '1' so the second invocation of this
1006          * function isn't operating over a structure with dangling pointers
1007          * (even though this function isn't actually touching them).
1008          */
1009         if (!slots->node_idx)
1010                 return;
1011
1012         hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1013                 kvm_free_memslot(kvm, memslot);
1014 }
1015
1016 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017 {
1018         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019         case KVM_STATS_TYPE_INSTANT:
1020                 return 0444;
1021         case KVM_STATS_TYPE_CUMULATIVE:
1022         case KVM_STATS_TYPE_PEAK:
1023         default:
1024                 return 0644;
1025         }
1026 }
1027
1028
1029 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030 {
1031         int i;
1032         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033                                       kvm_vcpu_stats_header.num_desc;
1034
1035         if (IS_ERR(kvm->debugfs_dentry))
1036                 return;
1037
1038         debugfs_remove_recursive(kvm->debugfs_dentry);
1039
1040         if (kvm->debugfs_stat_data) {
1041                 for (i = 0; i < kvm_debugfs_num_entries; i++)
1042                         kfree(kvm->debugfs_stat_data[i]);
1043                 kfree(kvm->debugfs_stat_data);
1044         }
1045 }
1046
/*
 * Create the per-VM debugfs directory, named "<pid>-<fdname>", and populate
 * it with one file per VM stat and per vCPU stat descriptor.
 *
 * Returns 0 on success and also when debugfs is unavailable, when a directory
 * of the same name already exists, or when the directory cannot be created;
 * in those cases the VM simply runs without debugfs entries.  Returns -ENOMEM
 * if a stat_data allocation fails, or the error from
 * kvm_arch_create_vm_debugfs(); on those paths everything created here is
 * torn down again via kvm_destroy_vm_debugfs().
 */
static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
{
        static DEFINE_MUTEX(kvm_debugfs_lock);
        struct dentry *dent;
        char dir_name[ITOA_MAX_LEN * 2];
        struct kvm_stat_data *stat_data;
        const struct _kvm_stats_desc *pdesc;
        int i, ret = -ENOMEM;
        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
                                      kvm_vcpu_stats_header.num_desc;

        if (!debugfs_initialized())
                return 0;

        snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
        /* The lock makes the lookup-then-create sequence below atomic. */
        mutex_lock(&kvm_debugfs_lock);
        dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
        if (dent) {
                pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
                dput(dent);
                mutex_unlock(&kvm_debugfs_lock);
                return 0;
        }
        dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
        mutex_unlock(&kvm_debugfs_lock);
        if (IS_ERR(dent))
                return 0;

        kvm->debugfs_dentry = dent;
        /* One slot per VM stat followed by one slot per vCPU stat. */
        kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
                                         sizeof(*kvm->debugfs_stat_data),
                                         GFP_KERNEL_ACCOUNT);
        if (!kvm->debugfs_stat_data)
                goto out_err;

        /* VM-scoped stats occupy slots [0, kvm_vm_stats_header.num_desc). */
        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
                pdesc = &kvm_vm_stats_desc[i];
                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                if (!stat_data)
                        goto out_err;

                stat_data->kvm = kvm;
                stat_data->desc = pdesc;
                stat_data->kind = KVM_STAT_VM;
                kvm->debugfs_stat_data[i] = stat_data;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);
        }

        /* vCPU-scoped stats follow, offset by the number of VM stats. */
        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
                pdesc = &kvm_vcpu_stats_desc[i];
                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                if (!stat_data)
                        goto out_err;

                stat_data->kvm = kvm;
                stat_data->desc = pdesc;
                stat_data->kind = KVM_STAT_VCPU;
                kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);
        }

        ret = kvm_arch_create_vm_debugfs(kvm);
        if (ret)
                goto out_err;

        return 0;
out_err:
        /* Removes the directory and frees whatever stat_data was allocated. */
        kvm_destroy_vm_debugfs(kvm);
        return ret;
}
1121
/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 *
 * This weak default does nothing and returns 0; kvm_create_vm() treats a
 * non-zero return as failure and unwinds the VM.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
        return 0;
}
1130
/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 *
 * This weak default does nothing.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}
1138
/*
 * Called after the per-VM debugfs directory has been created.  At that point
 * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
 * be created under it.  Cleanup is done automatically (and recursively) by
 * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
 *
 * This weak default creates nothing and returns 0.
 */
int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
        return 0;
}
1149
/*
 * Allocate and fully initialize a new VM.
 *
 * @type is passed through to kvm_arch_init_vm(); @fdname is used to name the
 * per-VM debugfs directory.
 *
 * Returns the new VM with users_count set to 1, or an ERR_PTR() on failure.
 * On failure every step already completed is undone in reverse order by the
 * label chain at the bottom of the function.
 */
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
        struct kvm *kvm = kvm_arch_alloc_vm();
        struct kvm_memslots *slots;
        /* Default error for the failure paths that do not set r themselves. */
        int r = -ENOMEM;
        int i, j;

        if (!kvm)
                return ERR_PTR(-ENOMEM);

        /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
        __module_get(kvm_chardev_ops.owner);

        KVM_MMU_LOCK_INIT(kvm);
        /* Reference on the creator's mm; dropped with mmdrop() on teardown. */
        mmgrab(current->mm);
        kvm->mm = current->mm;
        kvm_eventfd_init(kvm);
        mutex_init(&kvm->lock);
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
        mutex_init(&kvm->slots_arch_lock);
        spin_lock_init(&kvm->mn_invalidate_lock);
        rcuwait_init(&kvm->mn_memslots_update_rcuwait);
        xa_init(&kvm->vcpu_array);

        INIT_LIST_HEAD(&kvm->gpc_list);
        spin_lock_init(&kvm->gpc_lock);

        INIT_LIST_HEAD(&kvm->devices);
        kvm->max_vcpus = KVM_MAX_VCPUS;

        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

        /*
         * Force subsequent debugfs file creations to fail if the VM directory
         * is not created (by kvm_create_vm_debugfs()).
         */
        kvm->debugfs_dentry = ERR_PTR(-ENOENT);

        snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
                 task_pid_nr(current));

        if (init_srcu_struct(&kvm->srcu))
                goto out_err_no_srcu;
        if (init_srcu_struct(&kvm->irq_srcu))
                goto out_err_no_irq_srcu;

        refcount_set(&kvm->users_count, 1);
        /*
         * Each address space has two memslot sets (node_idx 0 and 1); set 0
         * starts out as the active one.
         */
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                for (j = 0; j < 2; j++) {
                        slots = &kvm->__memslots[i][j];

                        atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
                        slots->hva_tree = RB_ROOT_CACHED;
                        slots->gfn_tree = RB_ROOT;
                        hash_init(slots->id_hash);
                        slots->node_idx = j;

                        /* Generations must be different for each address space. */
                        slots->generation = i;
                }

                rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
        }

        /* A failed bus allocation leaves r at -ENOMEM. */
        for (i = 0; i < KVM_NR_BUSES; i++) {
                rcu_assign_pointer(kvm->buses[i],
                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                if (!kvm->buses[i])
                        goto out_err_no_arch_destroy_vm;
        }

        r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_no_arch_destroy_vm;

        r = hardware_enable_all();
        if (r)
                goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

        r = kvm_init_mmu_notifier(kvm);
        if (r)
                goto out_err_no_mmu_notifier;

        r = kvm_coalesced_mmio_init(kvm);
        if (r < 0)
                goto out_no_coalesced_mmio;

        r = kvm_create_vm_debugfs(kvm, fdname);
        if (r)
                goto out_err_no_debugfs;

        r = kvm_arch_post_init_vm(kvm);
        if (r)
                goto out_err;

        /* Nothing below can fail. */
        mutex_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        mutex_unlock(&kvm_lock);

        preempt_notifier_inc();
        kvm_init_pm_notifier(kvm);

        return kvm;

        /* Error unwinding: each label undoes the step that preceded it. */
out_err:
        kvm_destroy_vm_debugfs(kvm);
out_err_no_debugfs:
        kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        /* ops is only set once kvm_init_mmu_notifier() has run. */
        if (kvm->mmu_notifier.ops)
                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
        hardware_disable_all();
out_err_no_disable:
        kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
        /* Nobody else may hold a reference to a VM that was never published. */
        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
        /* Buses that were never allocated are NULL, which kfree() accepts. */
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm_get_bus(kvm, i));
        cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
        cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
        kvm_arch_free_vm(kvm);
        mmdrop(current->mm);
        module_put(kvm_chardev_ops.owner);
        return ERR_PTR(r);
}
1285
1286 static void kvm_destroy_devices(struct kvm *kvm)
1287 {
1288         struct kvm_device *dev, *tmp;
1289
1290         /*
1291          * We do not need to take the kvm->lock here, because nobody else
1292          * has a reference to the struct kvm at this point and therefore
1293          * cannot access the devices list anyhow.
1294          */
1295         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1296                 list_del(&dev->vm_node);
1297                 dev->ops->destroy(dev);
1298         }
1299 }
1300
/*
 * Tear down a VM and free all of its resources.  Called from kvm_put_kvm()
 * once the last reference has been dropped.
 */
static void kvm_destroy_vm(struct kvm *kvm)
{
        int i;
        /* Saved up front: @kvm is freed before the final mmdrop() below. */
        struct mm_struct *mm = kvm->mm;

        kvm_destroy_pm_notifier(kvm);
        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
        kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
        mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        mutex_unlock(&kvm_lock);
        kvm_arch_pre_destroy_vm(kvm);

        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

                if (bus)
                        kvm_io_bus_destroy(bus);
                kvm->buses[i] = NULL;
        }
        kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
        /*
         * At this point, pending calls to invalidate_range_start()
         * have completed but no more MMU notifiers will run, so
         * mn_active_invalidate_count may remain unbalanced.
         * No threads can be waiting in kvm_swap_active_memslots() as the
         * last reference on KVM has been dropped, but freeing
         * memslots would deadlock without this manual intervention.
         */
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
        kvm->mn_active_invalidate_count = 0;
#else
        kvm_flush_shadow_all(kvm);
#endif
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
        /* Only the node_idx 1 set actually frees; see kvm_free_memslots(). */
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
                kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
        }
        cleanup_srcu_struct(&kvm->irq_srcu);
        cleanup_srcu_struct(&kvm->srcu);
        kvm_arch_free_vm(kvm);
        /* Balance the counterparts taken in kvm_create_vm(). */
        preempt_notifier_dec();
        hardware_disable_all();
        mmdrop(mm);
        module_put(kvm_chardev_ops.owner);
}
1353
/* Take an additional reference on @kvm; released with kvm_put_kvm(). */
void kvm_get_kvm(struct kvm *kvm)
{
        refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);
1359
/*
 * Safe variant of kvm_get_kvm(): take a reference only if the VM is not
 * already being destroyed, i.e. only if users_count has not dropped to zero.
 * Returns true if a reference was taken, false otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
        return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1369
1370 void kvm_put_kvm(struct kvm *kvm)
1371 {
1372         if (refcount_dec_and_test(&kvm->users_count))
1373                 kvm_destroy_vm(kvm);
1374 }
1375 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1376
/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 *
 * Unlike kvm_put_kvm(), this never destroys the VM; it only warns if the
 * count reaches zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
        WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1389
/*
 * Release handler for a VM file: release the VM's irqfds, then drop the
 * reference on the VM stored in the file's private_data.  Always returns 0.
 */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
        struct kvm *kvm = filp->private_data;

        kvm_irqfd_release(kvm);

        kvm_put_kvm(kvm);
        return 0;
}
1399
1400 /*
1401  * Allocation size is twice as large as the actual dirty bitmap size.
1402  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1403  */
1404 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1405 {
1406         unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1407
1408         memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1409         if (!memslot->dirty_bitmap)
1410                 return -ENOMEM;
1411
1412         return 0;
1413 }
1414
1415 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1416 {
1417         struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1418         int node_idx_inactive = active->node_idx ^ 1;
1419
1420         return &kvm->__memslots[as_id][node_idx_inactive];
1421 }
1422
1423 /*
1424  * Helper to get the address space ID when one of memslot pointers may be NULL.
1425  * This also serves as a sanity that at least one of the pointers is non-NULL,
1426  * and that their address space IDs don't diverge.
1427  */
1428 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1429                                   struct kvm_memory_slot *b)
1430 {
1431         if (WARN_ON_ONCE(!a && !b))
1432                 return 0;
1433
1434         if (!a)
1435                 return b->as_id;
1436         if (!b)
1437                 return a->as_id;
1438
1439         WARN_ON_ONCE(a->as_id != b->as_id);
1440         return a->as_id;
1441 }
1442
1443 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1444                                 struct kvm_memory_slot *slot)
1445 {
1446         struct rb_root *gfn_tree = &slots->gfn_tree;
1447         struct rb_node **node, *parent;
1448         int idx = slots->node_idx;
1449
1450         parent = NULL;
1451         for (node = &gfn_tree->rb_node; *node; ) {
1452                 struct kvm_memory_slot *tmp;
1453
1454                 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1455                 parent = *node;
1456                 if (slot->base_gfn < tmp->base_gfn)
1457                         node = &(*node)->rb_left;
1458                 else if (slot->base_gfn > tmp->base_gfn)
1459                         node = &(*node)->rb_right;
1460                 else
1461                         BUG();
1462         }
1463
1464         rb_link_node(&slot->gfn_node[idx], parent, node);
1465         rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1466 }
1467
1468 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1469                                struct kvm_memory_slot *slot)
1470 {
1471         rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1472 }
1473
1474 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1475                                  struct kvm_memory_slot *old,
1476                                  struct kvm_memory_slot *new)
1477 {
1478         int idx = slots->node_idx;
1479
1480         WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1481
1482         rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1483                         &slots->gfn_tree);
1484 }
1485
/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * The id hash, the hva interval tree and the gfn tree of the inactive set are
 * all updated.  If @new is non-NULL, its hva_node range for the inactive set
 * is (re)initialized here from new->userspace_addr and new->npages.
 */
static void kvm_replace_memslot(struct kvm *kvm,
                                struct kvm_memory_slot *old,
                                struct kvm_memory_slot *new)
{
        int as_id = kvm_memslots_get_as_id(old, new);
        struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
        int idx = slots->node_idx;

        if (old) {
                hash_del(&old->id_node[idx]);
                interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

                /* Keep the last-used cache from pointing at the old slot. */
                if ((long)old == atomic_long_read(&slots->last_used_slot))
                        atomic_long_set(&slots->last_used_slot, (long)new);

                /* Pure removal: drop the gfn node as well and finish. */
                if (!new) {
                        kvm_erase_gfn_node(slots, old);
                        return;
                }
        }

        /*
         * Initialize @new's hva range.  Do this even when replacing an @old
         * slot, kvm_copy_memslot() deliberately does not touch node data.
         */
        new->hva_node[idx].start = new->userspace_addr;
        new->hva_node[idx].last = new->userspace_addr +
                                  (new->npages << PAGE_SHIFT) - 1;

        /*
         * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
         * hva_node needs to be swapped with remove+insert even though hva can't
         * change when replacing an existing slot.
         */
        hash_add(slots->id_hash, &new->id_node[idx], new->id);
        interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

        /*
         * If the memslot gfn is unchanged, rb_replace_node() can be used to
         * switch the node in the gfn tree instead of removing the old and
         * inserting the new as two separate operations. Replacement is a
         * single O(1) operation versus two O(log(n)) operations for
         * remove+insert.
         */
        if (old && old->base_gfn == new->base_gfn) {
                kvm_replace_gfn_node(slots, old, new);
        } else {
                if (old)
                        kvm_erase_gfn_node(slots, old);
                kvm_insert_gfn_node(slots, new);
        }
}
1547
1548 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1549 {
1550         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1551
1552 #ifdef __KVM_HAVE_READONLY_MEM
1553         valid_flags |= KVM_MEM_READONLY;
1554 #endif
1555
1556         if (mem->flags & ~valid_flags)
1557                 return -EINVAL;
1558
1559         return 0;
1560 }
1561
/*
 * Make the currently inactive memslot set of address space @as_id the active
 * one.
 *
 * The switch waits for all in-flight MMU notifier invalidations to finish,
 * publishes the new set, releases slots_arch_lock (which the caller must
 * hold on entry), waits for SRCU readers of the old set, and finally bumps
 * the generation of the new set.
 */
static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
{
        struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);

        /* Grab the generation from the active memslots. */
        u64 gen = __kvm_memslots(kvm, as_id)->generation;

        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

        /*
         * Do not store the new memslots while there are invalidations in
         * progress, otherwise the locking in invalidate_range_start and
         * invalidate_range_end will be unbalanced.
         */
        spin_lock(&kvm->mn_invalidate_lock);
        prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
        /*
         * The task state is set while still holding the lock and the count
         * is rechecked after every wakeup; the wakeup is issued by
         * invalidate_range_end() when the count drops to zero.
         */
        while (kvm->mn_active_invalidate_count) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&kvm->mn_invalidate_lock);
                schedule();
                spin_lock(&kvm->mn_invalidate_lock);
        }
        finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
        rcu_assign_pointer(kvm->memslots[as_id], slots);
        spin_unlock(&kvm->mn_invalidate_lock);

        /*
         * Acquired in kvm_set_memslot. Must be released before synchronize
         * SRCU below in order to avoid deadlock with another thread
         * acquiring the slots_arch_lock in an srcu critical section.
         */
        mutex_unlock(&kvm->slots_arch_lock);

        synchronize_srcu_expedited(&kvm->srcu);

        /*
         * Increment the new memslot generation a second time, dropping the
         * update in-progress flag and incrementing the generation based on
         * the number of address spaces.  This provides a unique and easily
         * identifiable generation number while the memslots are in flux.
         */
        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

        /*
         * Generations must be unique even across address spaces.  We do not need
         * a global counter for that, instead the generation space is evenly split
         * across address spaces.  For example, with two address spaces, address
         * space 0 will use generations 0, 2, 4, ... while address space 1 will
         * use generations 1, 3, 5, ...
         */
        gen += KVM_ADDRESS_SPACE_NUM;

        kvm_arch_memslots_updated(kvm, gen);

        slots->generation = gen;
}
1619
/*
 * Prepare a memslot change before it is committed: set up @new's dirty
 * bitmap as required by its flags, then call the arch prepare hook.
 *
 * @new is not dereferenced when @change is KVM_MR_DELETE.
 *
 * Returns 0 on success, or the error from the bitmap allocation or from
 * kvm_arch_prepare_memory_region().  A bitmap allocated here is freed again
 * if the arch hook fails.
 */
static int kvm_prepare_memory_region(struct kvm *kvm,
                                     const struct kvm_memory_slot *old,
                                     struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
{
        int r;

        /*
         * If dirty logging is disabled, nullify the bitmap; the old bitmap
         * will be freed on "commit".  If logging is enabled in both old and
         * new, reuse the existing bitmap.  If logging is enabled only in the
         * new and KVM isn't using a ring buffer, allocate and initialize a
         * new bitmap.
         */
        if (change != KVM_MR_DELETE) {
                if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
                        new->dirty_bitmap = NULL;
                else if (old && old->dirty_bitmap)
                        new->dirty_bitmap = old->dirty_bitmap;
                else if (kvm_use_dirty_bitmap(kvm)) {
                        r = kvm_alloc_dirty_bitmap(new);
                        if (r)
                                return r;

                        /* In this mode a new bitmap starts with all pages set. */
                        if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                                bitmap_set(new->dirty_bitmap, 0, new->npages);
                }
        }

        r = kvm_arch_prepare_memory_region(kvm, old, new, change);

        /*
         * Free the bitmap on failure if it was allocated above.  A bitmap
         * inherited from @old is left alone (the !old->dirty_bitmap test).
         */
        if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
                kvm_destroy_dirty_bitmap(new);

        return r;
}
1657
1658 static void kvm_commit_memory_region(struct kvm *kvm,
1659                                      struct kvm_memory_slot *old,
1660                                      const struct kvm_memory_slot *new,
1661                                      enum kvm_mr_change change)
1662 {
1663         int old_flags = old ? old->flags : 0;
1664         int new_flags = new ? new->flags : 0;
1665         /*
1666          * Update the total number of memslot pages before calling the arch
1667          * hook so that architectures can consume the result directly.
1668          */
1669         if (change == KVM_MR_DELETE)
1670                 kvm->nr_memslot_pages -= old->npages;
1671         else if (change == KVM_MR_CREATE)
1672                 kvm->nr_memslot_pages += new->npages;
1673
1674         if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1675                 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1676                 atomic_set(&kvm->nr_memslots_dirty_logging,
1677                            atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1678         }
1679
1680         kvm_arch_commit_memory_region(kvm, old, new, change);
1681
1682         switch (change) {
1683         case KVM_MR_CREATE:
1684                 /* Nothing more to do. */
1685                 break;
1686         case KVM_MR_DELETE:
1687                 /* Free the old memslot and all its metadata. */
1688                 kvm_free_memslot(kvm, old);
1689                 break;
1690         case KVM_MR_MOVE:
1691         case KVM_MR_FLAGS_ONLY:
1692                 /*
1693                  * Free the dirty bitmap as needed; the below check encompasses
1694                  * both the flags and whether a ring buffer is being used)
1695                  */
1696                 if (old->dirty_bitmap && !new->dirty_bitmap)
1697                         kvm_destroy_dirty_bitmap(old);
1698
1699                 /*
1700                  * The final quirk.  Free the detached, old slot, but only its
1701                  * memory, not any metadata.  Metadata, including arch specific
1702                  * data, may be reused by @new.
1703                  */
1704                 kfree(old);
1705                 break;
1706         default:
1707                 BUG();
1708         }
1709 }
1710
/*
 * Make @new live.  The caller must already have installed @new in the
 * inactive memslot set.  The active and inactive sets are swapped, and @old
 * is then replaced by @new in the set that just became inactive, so that
 * both sets end up describing the same slot.
 *
 * A NULL @old means @new is simply added; a NULL @new means @old is simply
 * removed.  The sets are swapped in either case.
 */
static void kvm_activate_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	kvm_swap_active_memslots(kvm, kvm_memslots_get_as_id(old, new));

	/* Bring the now-inactive set in line with the active one. */
	kvm_replace_memslot(kvm, old, new);
}
1731
1732 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1733                              const struct kvm_memory_slot *src)
1734 {
1735         dest->base_gfn = src->base_gfn;
1736         dest->npages = src->npages;
1737         dest->dirty_bitmap = src->dirty_bitmap;
1738         dest->arch = src->arch;
1739         dest->userspace_addr = src->userspace_addr;
1740         dest->flags = src->flags;
1741         dest->id = src->id;
1742         dest->as_id = src->as_id;
1743 }
1744
1745 static void kvm_invalidate_memslot(struct kvm *kvm,
1746                                    struct kvm_memory_slot *old,
1747                                    struct kvm_memory_slot *invalid_slot)
1748 {
1749         /*
1750          * Mark the current slot INVALID.  As with all memslot modifications,
1751          * this must be done on an unreachable slot to avoid modifying the
1752          * current slot in the active tree.
1753          */
1754         kvm_copy_memslot(invalid_slot, old);
1755         invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1756         kvm_replace_memslot(kvm, old, invalid_slot);
1757
1758         /*
1759          * Activate the slot that is now marked INVALID, but don't propagate
1760          * the slot to the now inactive slots. The slot is either going to be
1761          * deleted or recreated as a new slot.
1762          */
1763         kvm_swap_active_memslots(kvm, old->as_id);
1764
1765         /*
1766          * From this point no new shadow pages pointing to a deleted, or moved,
1767          * memslot will be created.  Validation of sp->gfn happens in:
1768          *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1769          *      - kvm_is_visible_gfn (mmu_check_root)
1770          */
1771         kvm_arch_flush_shadow_memslot(kvm, old);
1772         kvm_arch_guest_memory_reclaimed(kvm);
1773
1774         /* Was released by kvm_swap_active_memslots(), reacquire. */
1775         mutex_lock(&kvm->slots_arch_lock);
1776
1777         /*
1778          * Copy the arch-specific field of the newly-installed slot back to the
1779          * old slot as the arch data could have changed between releasing
1780          * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1781          * above.  Writers are required to retrieve memslots *after* acquiring
1782          * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1783          */
1784         old->arch = invalid_slot->arch;
1785 }
1786
1787 static void kvm_create_memslot(struct kvm *kvm,
1788                                struct kvm_memory_slot *new)
1789 {
1790         /* Add the new memslot to the inactive set and activate. */
1791         kvm_replace_memslot(kvm, NULL, new);
1792         kvm_activate_memslot(kvm, NULL, new);
1793 }
1794
1795 static void kvm_delete_memslot(struct kvm *kvm,
1796                                struct kvm_memory_slot *old,
1797                                struct kvm_memory_slot *invalid_slot)
1798 {
1799         /*
1800          * Remove the old memslot (in the inactive memslots) by passing NULL as
1801          * the "new" slot, and for the invalid version in the active slots.
1802          */
1803         kvm_replace_memslot(kvm, old, NULL);
1804         kvm_activate_memslot(kvm, invalid_slot, NULL);
1805 }
1806
/*
 * MOVE: substitute @new for @old in the inactive set, then activate so that
 * @new also takes the place of the INVALID stand-in (@invalid_slot) in the
 * other set.
 */
static void kvm_move_memslot(struct kvm *kvm,
			     struct kvm_memory_slot *old,
			     struct kvm_memory_slot *new,
			     struct kvm_memory_slot *invalid_slot)
{
	/* Inactive set first ... */
	kvm_replace_memslot(kvm, old, new);

	/* ... then swap the sets and replace the INVALID slot as well. */
	kvm_activate_memslot(kvm, invalid_slot, new);
}
1819
/*
 * FLAGS_ONLY: like a MOVE, except that no INVALID intermediate slot is
 * involved because the slot does not need to be zapped.  @old is simply
 * replaced by the updated copy @new in both memslot sets.
 */
static void kvm_update_flags_memslot(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new)
{
	/* Inactive set first ... */
	kvm_replace_memslot(kvm, old, new);

	/* ... then swap the sets and repeat the replacement in the other. */
	kvm_activate_memslot(kvm, old, new);
}
1832
1833 static int kvm_set_memslot(struct kvm *kvm,
1834                            struct kvm_memory_slot *old,
1835                            struct kvm_memory_slot *new,
1836                            enum kvm_mr_change change)
1837 {
1838         struct kvm_memory_slot *invalid_slot;
1839         int r;
1840
1841         /*
1842          * Released in kvm_swap_active_memslots().
1843          *
1844          * Must be held from before the current memslots are copied until after
1845          * the new memslots are installed with rcu_assign_pointer, then
1846          * released before the synchronize srcu in kvm_swap_active_memslots().
1847          *
1848          * When modifying memslots outside of the slots_lock, must be held
1849          * before reading the pointer to the current memslots until after all
1850          * changes to those memslots are complete.
1851          *
1852          * These rules ensure that installing new memslots does not lose
1853          * changes made to the previous memslots.
1854          */
1855         mutex_lock(&kvm->slots_arch_lock);
1856
1857         /*
1858          * Invalidate the old slot if it's being deleted or moved.  This is
1859          * done prior to actually deleting/moving the memslot to allow vCPUs to
1860          * continue running by ensuring there are no mappings or shadow pages
1861          * for the memslot when it is deleted/moved.  Without pre-invalidation
1862          * (and without a lock), a window would exist between effecting the
1863          * delete/move and committing the changes in arch code where KVM or a
1864          * guest could access a non-existent memslot.
1865          *
1866          * Modifications are done on a temporary, unreachable slot.  The old
1867          * slot needs to be preserved in case a later step fails and the
1868          * invalidation needs to be reverted.
1869          */
1870         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1871                 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1872                 if (!invalid_slot) {
1873                         mutex_unlock(&kvm->slots_arch_lock);
1874                         return -ENOMEM;
1875                 }
1876                 kvm_invalidate_memslot(kvm, old, invalid_slot);
1877         }
1878
1879         r = kvm_prepare_memory_region(kvm, old, new, change);
1880         if (r) {
1881                 /*
1882                  * For DELETE/MOVE, revert the above INVALID change.  No
1883                  * modifications required since the original slot was preserved
1884                  * in the inactive slots.  Changing the active memslots also
1885                  * release slots_arch_lock.
1886                  */
1887                 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1888                         kvm_activate_memslot(kvm, invalid_slot, old);
1889                         kfree(invalid_slot);
1890                 } else {
1891                         mutex_unlock(&kvm->slots_arch_lock);
1892                 }
1893                 return r;
1894         }
1895
1896         /*
1897          * For DELETE and MOVE, the working slot is now active as the INVALID
1898          * version of the old slot.  MOVE is particularly special as it reuses
1899          * the old slot and returns a copy of the old slot (in working_slot).
1900          * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
1901          * old slot is detached but otherwise preserved.
1902          */
1903         if (change == KVM_MR_CREATE)
1904                 kvm_create_memslot(kvm, new);
1905         else if (change == KVM_MR_DELETE)
1906                 kvm_delete_memslot(kvm, old, invalid_slot);
1907         else if (change == KVM_MR_MOVE)
1908                 kvm_move_memslot(kvm, old, new, invalid_slot);
1909         else if (change == KVM_MR_FLAGS_ONLY)
1910                 kvm_update_flags_memslot(kvm, old, new);
1911         else
1912                 BUG();
1913
1914         /* Free the temporary INVALID slot used for DELETE and MOVE. */
1915         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1916                 kfree(invalid_slot);
1917
1918         /*
1919          * No need to refresh new->arch, changes after dropping slots_arch_lock
1920          * will directly hit the final, active memslot.  Architectures are
1921          * responsible for knowing that new->arch may be stale.
1922          */
1923         kvm_commit_memory_region(kvm, old, new, change);
1924
1925         return 0;
1926 }
1927
1928 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1929                                       gfn_t start, gfn_t end)
1930 {
1931         struct kvm_memslot_iter iter;
1932
1933         kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1934                 if (iter.slot->id != id)
1935                         return true;
1936         }
1937
1938         return false;
1939 }
1940
1941 /*
1942  * Allocate some memory and give it an address in the guest physical address
1943  * space.
1944  *
1945  * Discontiguous memory is allowed, mostly for framebuffers.
1946  *
1947  * Must be called holding kvm->slots_lock for write.
1948  */
1949 int __kvm_set_memory_region(struct kvm *kvm,
1950                             const struct kvm_userspace_memory_region *mem)
1951 {
1952         struct kvm_memory_slot *old, *new;
1953         struct kvm_memslots *slots;
1954         enum kvm_mr_change change;
1955         unsigned long npages;
1956         gfn_t base_gfn;
1957         int as_id, id;
1958         int r;
1959
1960         r = check_memory_region_flags(mem);
1961         if (r)
1962                 return r;
1963
1964         as_id = mem->slot >> 16;
1965         id = (u16)mem->slot;
1966
1967         /* General sanity checks */
1968         if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1969             (mem->memory_size != (unsigned long)mem->memory_size))
1970                 return -EINVAL;
1971         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1972                 return -EINVAL;
1973         /* We can read the guest memory with __xxx_user() later on. */
1974         if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1975             (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1976              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1977                         mem->memory_size))
1978                 return -EINVAL;
1979         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1980                 return -EINVAL;
1981         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1982                 return -EINVAL;
1983         if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1984                 return -EINVAL;
1985
1986         slots = __kvm_memslots(kvm, as_id);
1987
1988         /*
1989          * Note, the old memslot (and the pointer itself!) may be invalidated
1990          * and/or destroyed by kvm_set_memslot().
1991          */
1992         old = id_to_memslot(slots, id);
1993
1994         if (!mem->memory_size) {
1995                 if (!old || !old->npages)
1996                         return -EINVAL;
1997
1998                 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1999                         return -EIO;
2000
2001                 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2002         }
2003
2004         base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2005         npages = (mem->memory_size >> PAGE_SHIFT);
2006
2007         if (!old || !old->npages) {
2008                 change = KVM_MR_CREATE;
2009
2010                 /*
2011                  * To simplify KVM internals, the total number of pages across
2012                  * all memslots must fit in an unsigned long.
2013                  */
2014                 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2015                         return -EINVAL;
2016         } else { /* Modify an existing slot. */
2017                 if ((mem->userspace_addr != old->userspace_addr) ||
2018                     (npages != old->npages) ||
2019                     ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2020                         return -EINVAL;
2021
2022                 if (base_gfn != old->base_gfn)
2023                         change = KVM_MR_MOVE;
2024                 else if (mem->flags != old->flags)
2025                         change = KVM_MR_FLAGS_ONLY;
2026                 else /* Nothing to change. */
2027                         return 0;
2028         }
2029
2030         if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2031             kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2032                 return -EEXIST;
2033
2034         /* Allocate a slot that will persist in the memslot. */
2035         new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2036         if (!new)
2037                 return -ENOMEM;
2038
2039         new->as_id = as_id;
2040         new->id = id;
2041         new->base_gfn = base_gfn;
2042         new->npages = npages;
2043         new->flags = mem->flags;
2044         new->userspace_addr = mem->userspace_addr;
2045
2046         r = kvm_set_memslot(kvm, old, new, change);
2047         if (r)
2048                 kfree(new);
2049         return r;
2050 }
2051 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2052
2053 int kvm_set_memory_region(struct kvm *kvm,
2054                           const struct kvm_userspace_memory_region *mem)
2055 {
2056         int r;
2057
2058         mutex_lock(&kvm->slots_lock);
2059         r = __kvm_set_memory_region(kvm, mem);
2060         mutex_unlock(&kvm->slots_lock);
2061         return r;
2062 }
2063 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2064
2065 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2066                                           struct kvm_userspace_memory_region *mem)
2067 {
2068         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2069                 return -EINVAL;
2070
2071         return kvm_set_memory_region(kvm, mem);
2072 }
2073
2074 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2075 /**
2076  * kvm_get_dirty_log - get a snapshot of dirty pages
2077  * @kvm:        pointer to kvm instance
2078  * @log:        slot id and address to which we copy the log
2079  * @is_dirty:   set to '1' if any dirty pages were found
2080  * @memslot:    set to the associated memslot, always valid on success
2081  */
2082 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2083                       int *is_dirty, struct kvm_memory_slot **memslot)
2084 {
2085         struct kvm_memslots *slots;
2086         int i, as_id, id;
2087         unsigned long n;
2088         unsigned long any = 0;
2089
2090         /* Dirty ring tracking may be exclusive to dirty log tracking */
2091         if (!kvm_use_dirty_bitmap(kvm))
2092                 return -ENXIO;
2093
2094         *memslot = NULL;
2095         *is_dirty = 0;
2096
2097         as_id = log->slot >> 16;
2098         id = (u16)log->slot;
2099         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2100                 return -EINVAL;
2101
2102         slots = __kvm_memslots(kvm, as_id);
2103         *memslot = id_to_memslot(slots, id);
2104         if (!(*memslot) || !(*memslot)->dirty_bitmap)
2105                 return -ENOENT;
2106
2107         kvm_arch_sync_dirty_log(kvm, *memslot);
2108
2109         n = kvm_dirty_bitmap_bytes(*memslot);
2110
2111         for (i = 0; !any && i < n/sizeof(long); ++i)
2112                 any = (*memslot)->dirty_bitmap[i];
2113
2114         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2115                 return -EFAULT;
2116
2117         if (any)
2118                 *is_dirty = 1;
2119         return 0;
2120 }
2121 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2122
2123 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2124 /**
2125  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2126  *      and reenable dirty page tracking for the corresponding pages.
2127  * @kvm:        pointer to kvm instance
2128  * @log:        slot id and address to which we copy the log
2129  *
2130  * We need to keep it in mind that VCPU threads can write to the bitmap
2131  * concurrently. So, to avoid losing track of dirty pages we keep the
2132  * following order:
2133  *
2134  *    1. Take a snapshot of the bit and clear it if needed.
2135  *    2. Write protect the corresponding page.
2136  *    3. Copy the snapshot to the userspace.
2137  *    4. Upon return caller flushes TLB's if needed.
2138  *
2139  * Between 2 and 4, the guest may write to the page using the remaining TLB
2140  * entry.  This is not a problem because the page is reported dirty using
2141  * the snapshot taken before and step 4 ensures that writes done after
2142  * exiting to userspace will be logged for the next call.
2143  *
2144  */
2145 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2146 {
2147         struct kvm_memslots *slots;
2148         struct kvm_memory_slot *memslot;
2149         int i, as_id, id;
2150         unsigned long n;
2151         unsigned long *dirty_bitmap;
2152         unsigned long *dirty_bitmap_buffer;
2153         bool flush;
2154
2155         /* Dirty ring tracking may be exclusive to dirty log tracking */
2156         if (!kvm_use_dirty_bitmap(kvm))
2157                 return -ENXIO;
2158
2159         as_id = log->slot >> 16;
2160         id = (u16)log->slot;
2161         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2162                 return -EINVAL;
2163
2164         slots = __kvm_memslots(kvm, as_id);
2165         memslot = id_to_memslot(slots, id);
2166         if (!memslot || !memslot->dirty_bitmap)
2167                 return -ENOENT;
2168
2169         dirty_bitmap = memslot->dirty_bitmap;
2170
2171         kvm_arch_sync_dirty_log(kvm, memslot);
2172
2173         n = kvm_dirty_bitmap_bytes(memslot);
2174         flush = false;
2175         if (kvm->manual_dirty_log_protect) {
2176                 /*
2177                  * Unlike kvm_get_dirty_log, we always return false in *flush,
2178                  * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2179                  * is some code duplication between this function and
2180                  * kvm_get_dirty_log, but hopefully all architecture
2181                  * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2182                  * can be eliminated.
2183                  */
2184                 dirty_bitmap_buffer = dirty_bitmap;
2185         } else {
2186                 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2187                 memset(dirty_bitmap_buffer, 0, n);
2188
2189                 KVM_MMU_LOCK(kvm);
2190                 for (i = 0; i < n / sizeof(long); i++) {
2191                         unsigned long mask;
2192                         gfn_t offset;
2193
2194                         if (!dirty_bitmap[i])
2195                                 continue;
2196
2197                         flush = true;
2198                         mask = xchg(&dirty_bitmap[i], 0);
2199                         dirty_bitmap_buffer[i] = mask;
2200
2201                         offset = i * BITS_PER_LONG;
2202                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2203                                                                 offset, mask);
2204                 }
2205                 KVM_MMU_UNLOCK(kvm);
2206         }
2207
2208         if (flush)
2209                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2210
2211         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2212                 return -EFAULT;
2213         return 0;
2214 }
2215
2216
2217 /**
2218  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2219  * @kvm: kvm instance
2220  * @log: slot id and address to which we copy the log
2221  *
2222  * Steps 1-4 below provide general overview of dirty page logging. See
2223  * kvm_get_dirty_log_protect() function description for additional details.
2224  *
2225  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2226  * always flush the TLB (step 4) even if previous step failed  and the dirty
2227  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
2228  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
2229  * writes will be marked dirty for next log read.
2230  *
2231  *   1. Take a snapshot of the bit and clear it if needed.
2232  *   2. Write protect the corresponding page.
2233  *   3. Copy the snapshot to the userspace.
2234  *   4. Flush TLB's if needed.
2235  */
2236 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2237                                       struct kvm_dirty_log *log)
2238 {
2239         int r;
2240
2241         mutex_lock(&kvm->slots_lock);
2242
2243         r = kvm_get_dirty_log_protect(kvm, log);
2244
2245         mutex_unlock(&kvm->slots_lock);
2246         return r;
2247 }
2248
2249 /**
2250  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2251  *      and reenable dirty page tracking for the corresponding pages.
2252  * @kvm:        pointer to kvm instance
2253  * @log:        slot id and address from which to fetch the bitmap of dirty pages
2254  */
2255 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2256                                        struct kvm_clear_dirty_log *log)
2257 {
2258         struct kvm_memslots *slots;
2259         struct kvm_memory_slot *memslot;
2260         int as_id, id;
2261         gfn_t offset;
2262         unsigned long i, n;
2263         unsigned long *dirty_bitmap;
2264         unsigned long *dirty_bitmap_buffer;
2265         bool flush;
2266
2267         /* Dirty ring tracking may be exclusive to dirty log tracking */
2268         if (!kvm_use_dirty_bitmap(kvm))
2269                 return -ENXIO;
2270
2271         as_id = log->slot >> 16;
2272         id = (u16)log->slot;
2273         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2274                 return -EINVAL;
2275
2276         if (log->first_page & 63)
2277                 return -EINVAL;
2278
2279         slots = __kvm_memslots(kvm, as_id);
2280         memslot = id_to_memslot(slots, id);
2281         if (!memslot || !memslot->dirty_bitmap)
2282                 return -ENOENT;
2283
2284         dirty_bitmap = memslot->dirty_bitmap;
2285
2286         n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2287
2288         if (log->first_page > memslot->npages ||
2289             log->num_pages > memslot->npages - log->first_page ||
2290             (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2291             return -EINVAL;
2292
2293         kvm_arch_sync_dirty_log(kvm, memslot);
2294
2295         flush = false;
2296         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2297         if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2298                 return -EFAULT;
2299
2300         KVM_MMU_LOCK(kvm);
2301         for (offset = log->first_page, i = offset / BITS_PER_LONG,
2302                  n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2303              i++, offset += BITS_PER_LONG) {
2304                 unsigned long mask = *dirty_bitmap_buffer++;
2305                 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2306                 if (!mask)
2307                         continue;
2308
2309                 mask &= atomic_long_fetch_andnot(mask, p);
2310
2311                 /*
2312                  * mask contains the bits that really have been cleared.  This
2313                  * never includes any bits beyond the length of the memslot (if
2314                  * the length is not aligned to 64 pages), therefore it is not
2315                  * a problem if userspace sets them in log->dirty_bitmap.
2316                 */
2317                 if (mask) {
2318                         flush = true;
2319                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2320                                                                 offset, mask);
2321                 }
2322         }
2323         KVM_MMU_UNLOCK(kvm);
2324
2325         if (flush)
2326                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2327
2328         return 0;
2329 }
2330
2331 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2332                                         struct kvm_clear_dirty_log *log)
2333 {
2334         int r;
2335
2336         mutex_lock(&kvm->slots_lock);
2337
2338         r = kvm_clear_dirty_log_protect(kvm, log);
2339
2340         mutex_unlock(&kvm->slots_lock);
2341         return r;
2342 }
2343 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2344
2345 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2346 {
2347         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2348 }
2349 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2350
2351 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2352 {
2353         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2354         u64 gen = slots->generation;
2355         struct kvm_memory_slot *slot;
2356
2357         /*
2358          * This also protects against using a memslot from a different address space,
2359          * since different address spaces have different generation numbers.
2360          */
2361         if (unlikely(gen != vcpu->last_used_slot_gen)) {
2362                 vcpu->last_used_slot = NULL;
2363                 vcpu->last_used_slot_gen = gen;
2364         }
2365
2366         slot = try_get_memslot(vcpu->last_used_slot, gfn);
2367         if (slot)
2368                 return slot;
2369
2370         /*
2371          * Fall back to searching all memslots. We purposely use
2372          * search_memslots() instead of __gfn_to_memslot() to avoid
2373          * thrashing the VM-wide last_used_slot in kvm_memslots.
2374          */
2375         slot = search_memslots(slots, gfn, false);
2376         if (slot) {
2377                 vcpu->last_used_slot = slot;
2378                 return slot;
2379         }
2380
2381         return NULL;
2382 }
2383
2384 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2385 {
2386         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2387
2388         return kvm_is_visible_memslot(memslot);
2389 }
2390 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2391
2392 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2393 {
2394         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2395
2396         return kvm_is_visible_memslot(memslot);
2397 }
2398 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2399
2400 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2401 {
2402         struct vm_area_struct *vma;
2403         unsigned long addr, size;
2404
2405         size = PAGE_SIZE;
2406
2407         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2408         if (kvm_is_error_hva(addr))
2409                 return PAGE_SIZE;
2410
2411         mmap_read_lock(current->mm);
2412         vma = find_vma(current->mm, addr);
2413         if (!vma)
2414                 goto out;
2415
2416         size = vma_kernel_pagesize(vma);
2417
2418 out:
2419         mmap_read_unlock(current->mm);
2420
2421         return size;
2422 }
2423
2424 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2425 {
2426         return slot->flags & KVM_MEM_READONLY;
2427 }
2428
2429 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2430                                        gfn_t *nr_pages, bool write)
2431 {
2432         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2433                 return KVM_HVA_ERR_BAD;
2434
2435         if (memslot_is_readonly(slot) && write)
2436                 return KVM_HVA_ERR_RO_BAD;
2437
2438         if (nr_pages)
2439                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2440
2441         return __gfn_to_hva_memslot(slot, gfn);
2442 }
2443
2444 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2445                                      gfn_t *nr_pages)
2446 {
2447         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2448 }
2449
2450 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2451                                         gfn_t gfn)
2452 {
2453         return gfn_to_hva_many(slot, gfn, NULL);
2454 }
2455 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2456
2457 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2458 {
2459         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2460 }
2461 EXPORT_SYMBOL_GPL(gfn_to_hva);
2462
2463 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2464 {
2465         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2466 }
2467 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2468
2469 /*
2470  * Return the hva of a @gfn and the R/W attribute if possible.
2471  *
2472  * @slot: the kvm_memory_slot which contains @gfn
2473  * @gfn: the gfn to be translated
2474  * @writable: used to return the read/write attribute of the @slot if the hva
2475  * is valid and @writable is not NULL
2476  */
2477 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2478                                       gfn_t gfn, bool *writable)
2479 {
2480         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2481
2482         if (!kvm_is_error_hva(hva) && writable)
2483                 *writable = !memslot_is_readonly(slot);
2484
2485         return hva;
2486 }
2487
2488 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2489 {
2490         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2491
2492         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2493 }
2494
2495 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2496 {
2497         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2498
2499         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2500 }
2501
2502 static inline int check_user_page_hwpoison(unsigned long addr)
2503 {
2504         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2505
2506         rc = get_user_pages(addr, 1, flags, NULL);
2507         return rc == -EHWPOISON;
2508 }
2509
2510 /*
2511  * The fast path to get the writable pfn which will be stored in @pfn,
2512  * true indicates success, otherwise false is returned.  It's also the
2513  * only part that runs if we can in atomic context.
2514  */
2515 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2516                             bool *writable, kvm_pfn_t *pfn)
2517 {
2518         struct page *page[1];
2519
2520         /*
2521          * Fast pin a writable pfn only if it is a write fault request
2522          * or the caller allows to map a writable pfn for a read fault
2523          * request.
2524          */
2525         if (!(write_fault || writable))
2526                 return false;
2527
2528         if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2529                 *pfn = page_to_pfn(page[0]);
2530
2531                 if (writable)
2532                         *writable = true;
2533                 return true;
2534         }
2535
2536         return false;
2537 }
2538
2539 /*
2540  * The slow path to get the pfn of the specified host virtual address,
2541  * 1 indicates success, -errno is returned if error is detected.
2542  */
2543 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2544                            bool interruptible, bool *writable, kvm_pfn_t *pfn)
2545 {
2546         unsigned int flags = FOLL_HWPOISON;
2547         struct page *page;
2548         int npages;
2549
2550         might_sleep();
2551
2552         if (writable)
2553                 *writable = write_fault;
2554
2555         if (write_fault)
2556                 flags |= FOLL_WRITE;
2557         if (async)
2558                 flags |= FOLL_NOWAIT;
2559         if (interruptible)
2560                 flags |= FOLL_INTERRUPTIBLE;
2561
2562         npages = get_user_pages_unlocked(addr, 1, &page, flags);
2563         if (npages != 1)
2564                 return npages;
2565
2566         /* map read fault as writable if possible */
2567         if (unlikely(!write_fault) && writable) {
2568                 struct page *wpage;
2569
2570                 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2571                         *writable = true;
2572                         put_page(page);
2573                         page = wpage;
2574                 }
2575         }
2576         *pfn = page_to_pfn(page);
2577         return npages;
2578 }
2579
2580 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2581 {
2582         if (unlikely(!(vma->vm_flags & VM_READ)))
2583                 return false;
2584
2585         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2586                 return false;
2587
2588         return true;
2589 }
2590
2591 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2592 {
2593         struct page *page = kvm_pfn_to_refcounted_page(pfn);
2594
2595         if (!page)
2596                 return 1;
2597
2598         return get_page_unless_zero(page);
2599 }
2600
/*
 * Resolve @addr inside @vma by reading its PTE directly; hva_to_pfn() calls
 * this for VM_IO / VM_PFNMAP VMAs.
 *
 * @vma:         the VMA containing @addr
 * @addr:        host virtual address to resolve
 * @write_fault: whether a writable mapping is required
 * @writable:    if non-NULL, receives pte_write() of the resolved PTE
 * @p_pfn:       receives the resolved pfn (or KVM_PFN_ERR_RO_FAULT)
 *
 * Returns 0 once a PTE was found; note that *@p_pfn is then
 * KVM_PFN_ERR_RO_FAULT if a write fault hit a non-writable PTE.  Returns
 * -EAGAIN when fixup_user_fault() set its 'unlocked' output, the error from
 * follow_pte()/fixup_user_fault() when no PTE could be obtained (*@p_pfn is
 * not written in those cases), or -EFAULT when no reference could be taken
 * on the pfn.
 */
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool write_fault,
			       bool *writable, kvm_pfn_t *p_pfn)
{
	kvm_pfn_t pfn;
	pte_t *ptep;
	pte_t pte;
	spinlock_t *ptl;
	int r;

	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;
		r = fixup_user_fault(current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		/* 'unlocked' takes priority over r; the caller retries on -EAGAIN. */
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		/* The fault was handled; look the PTE up again. */
		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
		if (r)
			return r;
	}

	/* From here on the PTE is mapped and locked; exit via 'out'. */
	pte = ptep_get(ptep);

	if (write_fault && !pte_write(pte)) {
		/* r is still 0 here; the failure is reported through the pfn. */
		pfn = KVM_PFN_ERR_RO_FAULT;
		goto out;
	}

	if (writable)
		*writable = pte_write(pte);
	pfn = pte_pfn(pte);

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 *
	 * Certain IO or PFNMAP mappings can be backed with valid
	 * struct pages, but be allocated without refcounting e.g.,
	 * tail pages of non-compound higher order allocations, which
	 * would then underflow the refcount when the caller does the
	 * required put_page. Don't allow those pages here.
	 */
	if (!kvm_try_get_pfn(pfn))
		r = -EFAULT;

out:
	pte_unmap_unlock(ptep, ptl);
	*p_pfn = pfn;

	return r;
}
2668
/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: if true, only the non-sleeping fast path is attempted
 * @interruptible: whether the process can be interrupted by non-fatal signals
 * @async: if non-NULL, the slow path does not wait for IO when the host
 *         page is not in memory (FOLL_NOWAIT); *@async is set to true when
 *         the lookup failed but the VMA is an ordinary one that permits
 *         the access
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 *
 * Returns the pfn on success, or one of KVM_PFN_ERR_FAULT,
 * KVM_PFN_ERR_SIGPENDING (slow path returned -EINTR) or
 * KVM_PFN_ERR_HWPOISON on failure.  For VM_IO / VM_PFNMAP VMAs the value
 * stored by hva_to_pfn_remapped() is returned, which may be
 * KVM_PFN_ERR_RO_FAULT.
 */
kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
		     bool *async, bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	/* Everything below may sleep. */
	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
				 writable, &pfn);
	if (npages == 1)
		return pfn;
	if (npages == -EINTR)
		return KVM_PFN_ERR_SIGPENDING;

	/* GUP failed; inspect the VMA under the mmap read lock to classify why. */
	mmap_read_lock(current->mm);
	if (npages == -EHWPOISON ||
	      (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = vma_lookup(current->mm, addr);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
		/* -EAGAIN: look the VMA up again before retrying. */
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		/* Ordinary VMA that allows the access: tell async callers so. */
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	mmap_read_unlock(current->mm);
	return pfn;
}
2734
2735 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2736                                bool atomic, bool interruptible, bool *async,
2737                                bool write_fault, bool *writable, hva_t *hva)
2738 {
2739         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2740
2741         if (hva)
2742                 *hva = addr;
2743
2744         if (addr == KVM_HVA_ERR_RO_BAD) {
2745                 if (writable)
2746                         *writable = false;
2747                 return KVM_PFN_ERR_RO_FAULT;
2748         }
2749
2750         if (kvm_is_error_hva(addr)) {
2751                 if (writable)
2752                         *writable = false;
2753                 return KVM_PFN_NOSLOT;
2754         }
2755
2756         /* Do not map writable pfn in the readonly memslot. */
2757         if (writable && memslot_is_readonly(slot)) {
2758                 *writable = false;
2759                 writable = NULL;
2760         }
2761
2762         return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
2763                           writable);
2764 }
2765 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2766
2767 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2768                       bool *writable)
2769 {
2770         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2771                                     NULL, write_fault, writable, NULL);
2772 }
2773 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2774
2775 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2776 {
2777         return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2778                                     NULL, NULL);
2779 }
2780 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2781
2782 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2783 {
2784         return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2785                                     NULL, NULL);
2786 }
2787 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2788
2789 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2790 {
2791         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2792 }
2793 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2794
2795 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2796 {
2797         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2798 }
2799 EXPORT_SYMBOL_GPL(gfn_to_pfn);
2800
2801 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2802 {
2803         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2804 }
2805 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2806
2807 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2808                             struct page **pages, int nr_pages)
2809 {
2810         unsigned long addr;
2811         gfn_t entry = 0;
2812
2813         addr = gfn_to_hva_many(slot, gfn, &entry);
2814         if (kvm_is_error_hva(addr))
2815                 return -1;
2816
2817         if (entry < nr_pages)
2818                 return 0;
2819
2820         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2821 }
2822 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2823
2824 /*
2825  * Do not use this helper unless you are absolutely certain the gfn _must_ be
2826  * backed by 'struct page'.  A valid example is if the backing memslot is
2827  * controlled by KVM.  Note, if the returned page is valid, it's refcount has
2828  * been elevated by gfn_to_pfn().
2829  */
2830 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2831 {
2832         struct page *page;
2833         kvm_pfn_t pfn;
2834
2835         pfn = gfn_to_pfn(kvm, gfn);
2836
2837         if (is_error_noslot_pfn(pfn))
2838                 return KVM_ERR_PTR_BAD_PAGE;
2839
2840         page = kvm_pfn_to_refcounted_page(pfn);
2841         if (!page)
2842                 return KVM_ERR_PTR_BAD_PAGE;
2843
2844         return page;
2845 }
2846 EXPORT_SYMBOL_GPL(gfn_to_page);
2847
2848 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2849 {
2850         if (dirty)
2851                 kvm_release_pfn_dirty(pfn);
2852         else
2853                 kvm_release_pfn_clean(pfn);
2854 }
2855
/*
 * Map the guest page at @gfn into the kernel and describe the mapping in
 * @map (page, hva, pfn, gfn).
 *
 * Pfns for which pfn_valid() is true are mapped with kmap(); other pfns are
 * mapped with memremap() when CONFIG_HAS_IOMEM is enabled, in which case
 * map->page is KVM_UNMAPPED_PAGE.
 *
 * Returns 0 on success, -EINVAL if @map is NULL or the gfn->pfn lookup
 * fails, and -EFAULT if no kernel mapping could be established.
 */
int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
{
	kvm_pfn_t pfn;
	void *hva = NULL;
	struct page *page = KVM_UNMAPPED_PAGE;

	if (!map)
		return -EINVAL;

	pfn = gfn_to_pfn(vcpu->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return -EINVAL;

	if (pfn_valid(pfn)) {
		page = pfn_to_page(pfn);
		hva = kmap(page);
#ifdef CONFIG_HAS_IOMEM
	} else {
		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
#endif
	}

	/*
	 * hva stays NULL when the mapping call failed, or when the pfn is not
	 * valid and CONFIG_HAS_IOMEM is off.
	 * NOTE(review): the pfn obtained above is not released on this path -
	 * confirm whether a reference can be held here.
	 */
	if (!hva)
		return -EFAULT;

	map->page = page;
	map->hva = hva;
	map->pfn = pfn;
	map->gfn = gfn;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2889
/*
 * Undo kvm_vcpu_map(): tear down the kernel mapping described by @map,
 * optionally mark the gfn dirty, and release the pfn.
 *
 * Does nothing when @map is NULL or holds no mapping (map->hva == NULL).
 * On return map->hva and map->page are NULL.
 */
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
{
	if (!map)
		return;

	if (!map->hva)
		return;

	/* Mirror the mapping method chosen in kvm_vcpu_map(). */
	if (map->page != KVM_UNMAPPED_PAGE)
		kunmap(map->page);
#ifdef CONFIG_HAS_IOMEM
	else
		memunmap(map->hva);
#endif

	if (dirty)
		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);

	kvm_release_pfn(map->pfn, dirty);

	/* Clear the mapping so a repeated unmap returns early above. */
	map->hva = NULL;
	map->page = NULL;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2914
2915 static bool kvm_is_ad_tracked_page(struct page *page)
2916 {
2917         /*
2918          * Per page-flags.h, pages tagged PG_reserved "should in general not be
2919          * touched (e.g. set dirty) except by its owner".
2920          */
2921         return !PageReserved(page);
2922 }
2923
/* Set the dirty flag on @page unless it is excluded from A/D tracking. */
static void kvm_set_page_dirty(struct page *page)
{
	if (!kvm_is_ad_tracked_page(page))
		return;

	SetPageDirty(page);
}
2929
/* Mark @page accessed unless it is excluded from A/D tracking. */
static void kvm_set_page_accessed(struct page *page)
{
	if (!kvm_is_ad_tracked_page(page))
		return;

	mark_page_accessed(page);
}
2935
/*
 * Mark @page accessed (if it is A/D tracked) and drop one reference on it.
 * Warns if @page is an error page; the release is still attempted.
 */
void kvm_release_page_clean(struct page *page)
{
	WARN_ON(is_error_page(page));

	/* Touch the page before put_page() drops our reference to it. */
	kvm_set_page_accessed(page);
	put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2944
2945 void kvm_release_pfn_clean(kvm_pfn_t pfn)
2946 {
2947         struct page *page;
2948
2949         if (is_error_noslot_pfn(pfn))
2950                 return;
2951
2952         page = kvm_pfn_to_refcounted_page(pfn);
2953         if (!page)
2954                 return;
2955
2956         kvm_release_page_clean(page);
2957 }
2958 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2959
/*
 * Mark @page dirty (if it is A/D tracked), then release it through
 * kvm_release_page_clean().  Warns if @page is an error page.
 */
void kvm_release_page_dirty(struct page *page)
{
	WARN_ON(is_error_page(page));

	/* Dirty the page while our reference is still held. */
	kvm_set_page_dirty(page);
	kvm_release_page_clean(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2968
2969 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2970 {
2971         struct page *page;
2972
2973         if (is_error_noslot_pfn(pfn))
2974                 return;
2975
2976         page = kvm_pfn_to_refcounted_page(pfn);
2977         if (!page)
2978                 return;
2979
2980         kvm_release_page_dirty(page);
2981 }
2982 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2983
2984 /*
2985  * Note, checking for an error/noslot pfn is the caller's responsibility when
2986  * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
2987  * "set" helpers are not to be used when the pfn might point at garbage.
2988  */
2989 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2990 {
2991         if (WARN_ON(is_error_noslot_pfn(pfn)))
2992                 return;
2993
2994         if (pfn_valid(pfn))
2995                 kvm_set_page_dirty(pfn_to_page(pfn));
2996 }
2997 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2998
2999 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3000 {
3001         if (WARN_ON(is_error_noslot_pfn(pfn)))
3002                 return;
3003
3004         if (pfn_valid(pfn))
3005                 kvm_set_page_accessed(pfn_to_page(pfn));
3006 }
3007 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3008
3009 static int next_segment(unsigned long len, int offset)
3010 {
3011         if (len > PAGE_SIZE - offset)
3012                 return PAGE_SIZE - offset;
3013         else
3014                 return len;
3015 }
3016
3017 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3018                                  void *data, int offset, int len)
3019 {
3020         int r;
3021         unsigned long addr;
3022
3023         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3024         if (kvm_is_error_hva(addr))
3025                 return -EFAULT;
3026         r = __copy_from_user(data, (void __user *)addr + offset, len);
3027         if (r)
3028                 return -EFAULT;
3029         return 0;
3030 }
3031
3032 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3033                         int len)
3034 {
3035         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3036
3037         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3038 }
3039 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3040
3041 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3042                              int offset, int len)
3043 {
3044         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3045
3046         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3047 }
3048 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3049
3050 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3051 {
3052         gfn_t gfn = gpa >> PAGE_SHIFT;
3053         int seg;
3054         int offset = offset_in_page(gpa);
3055         int ret;
3056
3057         while ((seg = next_segment(len, offset)) != 0) {
3058                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3059                 if (ret < 0)
3060                         return ret;
3061                 offset = 0;
3062                 len -= seg;
3063                 data += seg;
3064                 ++gfn;
3065         }
3066         return 0;
3067 }
3068 EXPORT_SYMBOL_GPL(kvm_read_guest);
3069
3070 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3071 {
3072         gfn_t gfn = gpa >> PAGE_SHIFT;
3073         int seg;
3074         int offset = offset_in_page(gpa);
3075         int ret;
3076
3077         while ((seg = next_segment(len, offset)) != 0) {
3078                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3079                 if (ret < 0)
3080                         return ret;
3081                 offset = 0;
3082                 len -= seg;
3083                 data += seg;
3084                 ++gfn;
3085         }
3086         return 0;
3087 }
3088 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3089
3090 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3091                                    void *data, int offset, unsigned long len)
3092 {
3093         int r;
3094         unsigned long addr;
3095
3096         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3097         if (kvm_is_error_hva(addr))
3098                 return -EFAULT;
3099         pagefault_disable();
3100         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3101         pagefault_enable();
3102         if (r)
3103                 return -EFAULT;
3104         return 0;
3105 }
3106
3107 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3108                                void *data, unsigned long len)
3109 {
3110         gfn_t gfn = gpa >> PAGE_SHIFT;
3111         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3112         int offset = offset_in_page(gpa);
3113
3114         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3115 }
3116 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3117
3118 static int __kvm_write_guest_page(struct kvm *kvm,
3119                                   struct kvm_memory_slot *memslot, gfn_t gfn,
3120                                   const void *data, int offset, int len)
3121 {
3122         int r;
3123         unsigned long addr;
3124
3125         addr = gfn_to_hva_memslot(memslot, gfn);
3126         if (kvm_is_error_hva(addr))
3127                 return -EFAULT;
3128         r = __copy_to_user((void __user *)addr + offset, data, len);
3129         if (r)
3130                 return -EFAULT;
3131         mark_page_dirty_in_slot(kvm, memslot, gfn);
3132         return 0;
3133 }
3134
3135 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3136                          const void *data, int offset, int len)
3137 {
3138         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3139
3140         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3141 }
3142 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3143
3144 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3145                               const void *data, int offset, int len)
3146 {
3147         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3148
3149         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3150 }
3151 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3152
3153 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3154                     unsigned long len)
3155 {
3156         gfn_t gfn = gpa >> PAGE_SHIFT;
3157         int seg;
3158         int offset = offset_in_page(gpa);
3159         int ret;
3160
3161         while ((seg = next_segment(len, offset)) != 0) {
3162                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3163                 if (ret < 0)
3164                         return ret;
3165                 offset = 0;
3166                 len -= seg;
3167                 data += seg;
3168                 ++gfn;
3169         }
3170         return 0;
3171 }
3172 EXPORT_SYMBOL_GPL(kvm_write_guest);
3173
3174 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3175                          unsigned long len)
3176 {
3177         gfn_t gfn = gpa >> PAGE_SHIFT;
3178         int seg;
3179         int offset = offset_in_page(gpa);
3180         int ret;
3181
3182         while ((seg = next_segment(len, offset)) != 0) {
3183                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3184                 if (ret < 0)
3185                         return ret;
3186                 offset = 0;
3187                 len -= seg;
3188                 data += seg;
3189                 ++gfn;
3190         }
3191         return 0;
3192 }
3193 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3194
/*
 * (Re)initialise @ghc to describe the guest range [@gpa, @gpa + @len) as
 * seen through @slots.
 *
 * ghc->generation is always updated.  On success ghc->gpa and ghc->len are
 * recorded; for a range contained in a single page ghc->hva points at the
 * first byte and ghc->memslot at its slot, while for a multi-page range
 * ghc->memslot is set to NULL so that users fall back to the uncached
 * accessors.
 *
 * Returns 0 on success, -EINVAL if the computed end gfn precedes the start
 * gfn (ghc->hva is then KVM_HVA_ERR_BAD), or -EFAULT if any page of the
 * range has no valid writable hva.
 */
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa);
	gfn_t start_gfn = gpa >> PAGE_SHIFT;
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
	gfn_t nr_pages_avail;

	/* Update ghc->generation before performing any error checks. */
	ghc->generation = slots->generation;

	if (start_gfn > end_gfn) {
		ghc->hva = KVM_HVA_ERR_BAD;
		return -EINVAL;
	}

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 * Each iteration advances by the number of pages left in the slot
	 * just looked up, as reported through nr_pages_avail.
	 */
	for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);
		if (kvm_is_error_hva(ghc->hva))
			return -EFAULT;
	}

	/* Use the slow path for cross page reads and writes. */
	if (nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	ghc->gpa = gpa;
	ghc->len = len;
	return 0;
}
3235
3236 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3237                               gpa_t gpa, unsigned long len)
3238 {
3239         struct kvm_memslots *slots = kvm_memslots(kvm);
3240         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3241 }
3242 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3243
3244 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3245                                   void *data, unsigned int offset,
3246                                   unsigned long len)
3247 {
3248         struct kvm_memslots *slots = kvm_memslots(kvm);
3249         int r;
3250         gpa_t gpa = ghc->gpa + offset;
3251
3252         if (WARN_ON_ONCE(len + offset > ghc->len))
3253                 return -EINVAL;
3254
3255         if (slots->generation != ghc->generation) {
3256                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3257                         return -EFAULT;
3258         }
3259
3260         if (kvm_is_error_hva(ghc->hva))
3261                 return -EFAULT;
3262
3263         if (unlikely(!ghc->memslot))
3264                 return kvm_write_guest(kvm, gpa, data, len);
3265
3266         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3267         if (r)
3268                 return -EFAULT;
3269         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3270
3271         return 0;
3272 }
3273 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3274
/* Write @len bytes from @data to the start of the cached guest range @ghc. */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	const unsigned int start = 0;

	return kvm_write_guest_offset_cached(kvm, ghc, data, start, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3281
3282 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3283                                  void *data, unsigned int offset,
3284                                  unsigned long len)
3285 {
3286         struct kvm_memslots *slots = kvm_memslots(kvm);
3287         int r;
3288         gpa_t gpa = ghc->gpa + offset;
3289
3290         if (WARN_ON_ONCE(len + offset > ghc->len))
3291                 return -EINVAL;
3292
3293         if (slots->generation != ghc->generation) {
3294                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3295                         return -EFAULT;
3296         }
3297
3298         if (kvm_is_error_hva(ghc->hva))
3299                 return -EFAULT;
3300
3301         if (unlikely(!ghc->memslot))
3302                 return kvm_read_guest(kvm, gpa, data, len);
3303
3304         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3305         if (r)
3306                 return -EFAULT;
3307
3308         return 0;
3309 }
3310 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3311
/* Read @len bytes into @data from the start of the cached guest range @ghc. */
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			  void *data, unsigned long len)
{
	const unsigned int start = 0;

	return kvm_read_guest_offset_cached(kvm, ghc, data, start, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3318
3319 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3320 {
3321         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3322         gfn_t gfn = gpa >> PAGE_SHIFT;
3323         int seg;
3324         int offset = offset_in_page(gpa);
3325         int ret;
3326
3327         while ((seg = next_segment(len, offset)) != 0) {
3328                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3329                 if (ret < 0)
3330                         return ret;
3331                 offset = 0;
3332                 len -= seg;
3333                 ++gfn;
3334         }
3335         return 0;
3336 }
3337 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3338
/*
 * Record that @gfn in @memslot has been written.
 *
 * Nothing is recorded when @memslot is NULL or dirty tracking is not enabled
 * for it.  Otherwise the page is pushed onto the running vCPU's dirty ring
 * when @kvm has a dirty ring size configured and a vCPU is running; failing
 * that, the bit for the page is set in the memslot's dirty bitmap if it has
 * one.
 */
void mark_page_dirty_in_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *memslot,
			     gfn_t gfn)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

#ifdef CONFIG_HAVE_KVM_DIRTY_RING
	/* A running vCPU that belongs to a different VM is a bug; bail out. */
	if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
		return;

	/* Warn (once) about writes without a running vCPU unless the arch allows them. */
	WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
#endif

	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
		/* Page index relative to the start of the slot. */
		unsigned long rel_gfn = gfn - memslot->base_gfn;
		/* Slot identifier: address space id in bits 16 and up, slot id below. */
		u32 slot = (memslot->as_id << 16) | memslot->id;

		if (kvm->dirty_ring_size && vcpu)
			kvm_dirty_ring_push(vcpu, slot, rel_gfn);
		else if (memslot->dirty_bitmap)
			set_bit_le(rel_gfn, memslot->dirty_bitmap);
	}
}
EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3363
3364 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3365 {
3366         struct kvm_memory_slot *memslot;
3367
3368         memslot = gfn_to_memslot(kvm, gfn);
3369         mark_page_dirty_in_slot(kvm, memslot, gfn);
3370 }
3371 EXPORT_SYMBOL_GPL(mark_page_dirty);
3372
3373 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3374 {
3375         struct kvm_memory_slot *memslot;
3376
3377         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3378         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3379 }
3380 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3381
3382 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3383 {
3384         if (!vcpu->sigset_active)
3385                 return;
3386
3387         /*
3388          * This does a lockless modification of ->real_blocked, which is fine
3389          * because, only current can change ->real_blocked and all readers of
3390          * ->real_blocked don't care as long ->real_blocked is always a subset
3391          * of ->blocked.
3392          */
3393         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3394 }
3395
3396 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3397 {
3398         if (!vcpu->sigset_active)
3399                 return;
3400
3401         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3402         sigemptyset(&current->real_blocked);
3403 }
3404
3405 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3406 {
3407         unsigned int old, val, grow, grow_start;
3408
3409         old = val = vcpu->halt_poll_ns;
3410         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3411         grow = READ_ONCE(halt_poll_ns_grow);
3412         if (!grow)
3413                 goto out;
3414
3415         val *= grow;
3416         if (val < grow_start)
3417                 val = grow_start;
3418
3419         vcpu->halt_poll_ns = val;
3420 out:
3421         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3422 }
3423
3424 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3425 {
3426         unsigned int old, val, shrink, grow_start;
3427
3428         old = val = vcpu->halt_poll_ns;
3429         shrink = READ_ONCE(halt_poll_ns_shrink);
3430         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3431         if (shrink == 0)
3432                 val = 0;
3433         else
3434                 val /= shrink;
3435
3436         if (val < grow_start)
3437                 val = 0;
3438
3439         vcpu->halt_poll_ns = val;
3440         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3441 }
3442
3443 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3444 {
3445         int ret = -EINTR;
3446         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3447
3448         if (kvm_arch_vcpu_runnable(vcpu))
3449                 goto out;
3450         if (kvm_cpu_has_pending_timer(vcpu))
3451                 goto out;
3452         if (signal_pending(current))
3453                 goto out;
3454         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3455                 goto out;
3456
3457         ret = 0;
3458 out:
3459         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3460         return ret;
3461 }
3462
/*
 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
 * pending.  This is mostly used when halting a vCPU, but may also be used
 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
 *
 * Returns true if the task called schedule() at least once, false if a wake
 * condition was already present on the first check.
 */
bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
        bool waited = false;

        /* Flag the vCPU as blocking in its stats for the duration of the wait. */
        vcpu->stat.generic.blocking = 1;

        /* The arch hook and the rcuwait setup run with preemption disabled. */
        preempt_disable();
        kvm_arch_vcpu_blocking(vcpu);
        prepare_to_rcuwait(wait);
        preempt_enable();

        for (;;) {
                /*
                 * The task state is set to TASK_INTERRUPTIBLE before each
                 * check of the wake conditions, and only then is schedule()
                 * called.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;

                waited = true;
                schedule();
        }

        /* Tear down in the reverse order of the setup above. */
        preempt_disable();
        finish_rcuwait(wait);
        kvm_arch_vcpu_unblocking(vcpu);
        preempt_enable();

        vcpu->stat.generic.blocking = 0;

        return waited;
}
3499
3500 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3501                                           ktime_t end, bool success)
3502 {
3503         struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3504         u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3505
3506         ++vcpu->stat.generic.halt_attempted_poll;
3507
3508         if (success) {
3509                 ++vcpu->stat.generic.halt_successful_poll;
3510
3511                 if (!vcpu_valid_wakeup(vcpu))
3512                         ++vcpu->stat.generic.halt_poll_invalid;
3513
3514                 stats->halt_poll_success_ns += poll_ns;
3515                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3516         } else {
3517                 stats->halt_poll_fail_ns += poll_ns;
3518                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3519         }
3520 }
3521
3522 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3523 {
3524         struct kvm *kvm = vcpu->kvm;
3525
3526         if (kvm->override_halt_poll_ns) {
3527                 /*
3528                  * Ensure kvm->max_halt_poll_ns is not read before
3529                  * kvm->override_halt_poll_ns.
3530                  *
3531                  * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3532                  */
3533                 smp_rmb();
3534                 return READ_ONCE(kvm->max_halt_poll_ns);
3535         }
3536
3537         return READ_ONCE(halt_poll_ns);
3538 }
3539
/*
 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
 * polling is enabled, busy wait for a short time before blocking to avoid the
 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
 * is halted.
 *
 * Once the halt is over, the per-vCPU polling window (vcpu->halt_poll_ns) is
 * grown, shrunk or zeroed based on how long the vCPU was halted and on the
 * current maximum polling time, and the wakeup is traced.
 */
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
        unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
        bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
        ktime_t start, cur, poll_end;
        bool waited = false;
        bool do_halt_poll;
        u64 halt_ns;

        /* Clamp the per-vCPU window to the current maximum before using it. */
        if (vcpu->halt_poll_ns > max_halt_poll_ns)
                vcpu->halt_poll_ns = max_halt_poll_ns;

        /* Poll only if the arch allows it and the window is non-zero. */
        do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;

        start = cur = poll_end = ktime_get();
        if (do_halt_poll) {
                ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);

                do {
                        /* A wake condition showed up while polling: skip blocking. */
                        if (kvm_vcpu_check_block(vcpu) < 0)
                                goto out;
                        cpu_relax();
                        poll_end = cur = ktime_get();
                } while (kvm_vcpu_can_poll(cur, stop));
        }

        waited = kvm_vcpu_block(vcpu);

        cur = ktime_get();
        if (waited) {
                /* Time spent blocked, measured from the end of polling. */
                vcpu->stat.generic.halt_wait_ns +=
                        ktime_to_ns(cur) - ktime_to_ns(poll_end);
                KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
                                ktime_to_ns(cur) - ktime_to_ns(poll_end));
        }
out:
        /* The total time the vCPU was "halted", including polling time. */
        halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);

        /*
         * Note, halt-polling is considered successful so long as the vCPU was
         * never actually scheduled out, i.e. even if the wake event arrived
         * after the halt-polling loop itself, but before the full wait.
         */
        if (do_halt_poll)
                update_halt_poll_stats(vcpu, start, poll_end, !waited);

        if (halt_poll_allowed) {
                /* Recompute the max halt poll time in case it changed. */
                max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);

                if (!vcpu_valid_wakeup(vcpu)) {
                        /* Invalid wakeup: always shrink the window. */
                        shrink_halt_poll_ns(vcpu);
                } else if (max_halt_poll_ns) {
                        /* The halt fit inside the current window: keep it. */
                        if (halt_ns <= vcpu->halt_poll_ns)
                                ;
                        /* we had a long block, shrink polling */
                        else if (vcpu->halt_poll_ns &&
                                 halt_ns > max_halt_poll_ns)
                                shrink_halt_poll_ns(vcpu);
                        /* we had a short halt and our poll time is too small */
                        else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
                                 halt_ns < max_halt_poll_ns)
                                grow_halt_poll_ns(vcpu);
                } else {
                        /* A maximum of zero turns polling off for this vCPU. */
                        vcpu->halt_poll_ns = 0;
                }
        }

        trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
}
3617 EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3618
3619 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3620 {
3621         if (__kvm_vcpu_wake_up(vcpu)) {
3622                 WRITE_ONCE(vcpu->ready, true);
3623                 ++vcpu->stat.generic.halt_wakeup;
3624                 return true;
3625         }
3626
3627         return false;
3628 }
3629 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3630
3631 #ifndef CONFIG_S390
3632 /*
3633  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3634  */
3635 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3636 {
3637         int me, cpu;
3638
3639         if (kvm_vcpu_wake_up(vcpu))
3640                 return;
3641
3642         me = get_cpu();
3643         /*
3644          * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3645          * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
3646          * kick" check does not need atomic operations if kvm_vcpu_kick is used
3647          * within the vCPU thread itself.
3648          */
3649         if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3650                 if (vcpu->mode == IN_GUEST_MODE)
3651                         WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3652                 goto out;
3653         }
3654
3655         /*
3656          * Note, the vCPU could get migrated to a different pCPU at any point
3657          * after kvm_arch_vcpu_should_kick(), which could result in sending an
3658          * IPI to the previous pCPU.  But, that's ok because the purpose of the
3659          * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3660          * vCPU also requires it to leave IN_GUEST_MODE.
3661          */
3662         if (kvm_arch_vcpu_should_kick(vcpu)) {
3663                 cpu = READ_ONCE(vcpu->cpu);
3664                 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3665                         smp_send_reschedule(cpu);
3666         }
3667 out:
3668         put_cpu();
3669 }
3670 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3671 #endif /* !CONFIG_S390 */
3672
3673 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3674 {
3675         struct pid *pid;
3676         struct task_struct *task = NULL;
3677         int ret = 0;
3678
3679         rcu_read_lock();
3680         pid = rcu_dereference(target->pid);
3681         if (pid)
3682                 task = get_pid_task(pid, PIDTYPE_PID);
3683         rcu_read_unlock();
3684         if (!task)
3685                 return ret;
3686         ret = yield_to(task, 1);
3687         put_task_struct(task);
3688
3689         return ret;
3690 }
3691 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3692
/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate to yield to is decided by these heuristics:
 *
 *  (a) A VCPU which has not done a pl-exit or had cpu relax intercepted
 *  recently (a likely preempted lock holder), indicated by @in_spin_loop
 *  being clear.  @in_spin_loop is set at the beginning and cleared at the
 *  end of the interception/PLE handler.
 *
 *  (b) A VCPU which has done a pl-exit or had cpu relax intercepted but did
 *  not get a chance last time (it has most likely become eligible now, since
 *  we have probably yielded to the lock holder in the last iteration).  This
 *  is done by toggling @dy_eligible each time a VCPU is checked for
 *  eligibility.
 *
 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before
 *  yielding to a preempted lock holder could result in wrong VCPU selection
 *  and CPU burning.  Giving priority to a potential lock holder increases
 *  lock progress.
 *
 *  Since the algorithm is based on heuristics, accessing another VCPU's data
 *  without locking does no harm.  It may result in trying to yield to the
 *  same VCPU, failing, and continuing with the next VCPU and so on.
 *
 *  Without CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT every VCPU is eligible.
 */
static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
        bool eligible;

        /* Eligible if not spinning, or spinning but skipped last time. */
        eligible = !vcpu->spin_loop.in_spin_loop ||
                    vcpu->spin_loop.dy_eligible;

        /* Flip dy_eligible so a spinning VCPU alternates between checks. */
        if (vcpu->spin_loop.in_spin_loop)
                kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);

        return eligible;
#else
        return true;
#endif
}
3731
/*
 * Unlike kvm_arch_vcpu_runnable, this function is called outside
 * a vcpu_load/vcpu_put pair.  However, for most architectures
 * kvm_arch_vcpu_runnable does not require vcpu_load.
 *
 * This is the weak default, which simply defers to kvm_arch_vcpu_runnable();
 * being __weak, it can be replaced by an architecture-specific definition.
 */
bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
        return kvm_arch_vcpu_runnable(vcpu);
}
3741
3742 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3743 {
3744         if (kvm_arch_dy_runnable(vcpu))
3745                 return true;
3746
3747 #ifdef CONFIG_KVM_ASYNC_PF
3748         if (!list_empty_careful(&vcpu->async_pf.done))
3749                 return true;
3750 #endif
3751
3752         return false;
3753 }
3754
/*
 * Weak default for the directed-yield "has pending interrupt" hook: reports
 * that no interrupt is pending.  Being __weak, it can be replaced by an
 * architecture-specific definition.
 */
bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
        return false;
}
3759
3760 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3761 {
3762         struct kvm *kvm = me->kvm;
3763         struct kvm_vcpu *vcpu;
3764         int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3765         unsigned long i;
3766         int yielded = 0;
3767         int try = 3;
3768         int pass;
3769
3770         kvm_vcpu_set_in_spin_loop(me, true);
3771         /*
3772          * We boost the priority of a VCPU that is runnable but not
3773          * currently running, because it got preempted by something
3774          * else and called schedule in __vcpu_run.  Hopefully that
3775          * VCPU is holding the lock that we need and will release it.
3776          * We approximate round-robin by starting at the last boosted VCPU.
3777          */
3778         for (pass = 0; pass < 2 && !yielded && try; pass++) {
3779                 kvm_for_each_vcpu(i, vcpu, kvm) {
3780                         if (!pass && i <= last_boosted_vcpu) {
3781                                 i = last_boosted_vcpu;
3782                                 continue;
3783                         } else if (pass && i > last_boosted_vcpu)
3784                                 break;
3785                         if (!READ_ONCE(vcpu->ready))
3786                                 continue;
3787                         if (vcpu == me)
3788                                 continue;
3789                         if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3790                                 continue;
3791                         if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3792                             !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3793                             !kvm_arch_vcpu_in_kernel(vcpu))
3794                                 continue;
3795                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3796                                 continue;
3797
3798                         yielded = kvm_vcpu_yield_to(vcpu);
3799                         if (yielded > 0) {
3800                                 kvm->last_boosted_vcpu = i;
3801                                 break;
3802                         } else if (yielded < 0) {
3803                                 try--;
3804                                 if (!try)
3805                                         break;
3806                         }
3807                 }
3808         }
3809         kvm_vcpu_set_in_spin_loop(me, false);
3810
3811         /* Ensure vcpu is not eligible during next spinloop */
3812         kvm_vcpu_set_dy_eligible(me, false);
3813 }
3814 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3815
3816 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3817 {
3818 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3819         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3820             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3821              kvm->dirty_ring_size / PAGE_SIZE);
3822 #else
3823         return false;
3824 #endif
3825 }
3826
/*
 * Page-fault handler for mappings of a vCPU file.
 *
 * The faulting page offset selects which kernel page backs the mapping:
 *   - offset 0: the vCPU's kvm_run page;
 *   - KVM_PIO_PAGE_OFFSET (x86 only): the arch PIO data page;
 *   - KVM_COALESCED_MMIO_PAGE_OFFSET (CONFIG_KVM_MMIO only): the VM's
 *     coalesced MMIO ring;
 *   - an offset inside the dirty-ring window: the matching dirty-ring page;
 *   - anything else is handed to kvm_arch_vcpu_fault().
 *
 * For the cases handled here, a reference is taken on the page, it is
 * stored in vmf->page and 0 is returned.
 */
static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
{
        struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef CONFIG_KVM_MMIO
        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
        else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
                /* Index into the ring relative to the start of its window. */
                page = kvm_dirty_ring_get_page(
                    &vcpu->dirty_ring,
                    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
        else
                return kvm_arch_vcpu_fault(vcpu, vmf);
        get_page(page);
        vmf->page = page;
        return 0;
}
3852
/* VMA operations installed by kvm_vcpu_mmap(); only a fault handler is set. */
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
        .fault = kvm_vcpu_fault,
};
3856
3857 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3858 {
3859         struct kvm_vcpu *vcpu = file->private_data;
3860         unsigned long pages = vma_pages(vma);
3861
3862         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3863              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3864             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3865                 return -EINVAL;
3866
3867         vma->vm_ops = &kvm_vcpu_vm_ops;
3868         return 0;
3869 }
3870
/*
 * release() handler for a vCPU file: drops the VM reference that
 * kvm_vm_ioctl_create_vcpu() takes with kvm_get_kvm() before creating the
 * vCPU fd.  Always returns 0.
 */
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
        struct kvm_vcpu *vcpu = filp->private_data;

        kvm_put_kvm(vcpu->kvm);
        return 0;
}
3878
/* File operations backing the vCPU file descriptor made by create_vcpu_fd(). */
static const struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .mmap           = kvm_vcpu_mmap,
        .llseek         = noop_llseek,
        /* NOTE(review): presumably wires up the compat ioctl entry - confirm. */
        KVM_COMPAT(kvm_vcpu_compat_ioctl),
};
3886
/*
 * Create the file descriptor userspace uses to reach @vcpu.
 *
 * The fd is an anonymous-inode file named "kvm-vcpu:<vcpu_id>", backed by
 * kvm_vcpu_fops with @vcpu as its private data, and opened with
 * O_RDWR | O_CLOEXEC.  Returns whatever anon_inode_getfd() returns.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
        /* Room for "kvm-vcpu" (8), ':' (1), the id digits and the NUL. */
        char name[8 + 1 + ITOA_MAX_LEN + 1];

        snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
        return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
}
3897
3898 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3899 static int vcpu_get_pid(void *data, u64 *val)
3900 {
3901         struct kvm_vcpu *vcpu = data;
3902
3903         rcu_read_lock();
3904         *val = pid_nr(rcu_dereference(vcpu->pid));
3905         rcu_read_unlock();
3906         return 0;
3907 }
3908
3909 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3910
3911 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3912 {
3913         struct dentry *debugfs_dentry;
3914         char dir_name[ITOA_MAX_LEN * 2];
3915
3916         if (!debugfs_initialized())
3917                 return;
3918
3919         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3920         debugfs_dentry = debugfs_create_dir(dir_name,
3921                                             vcpu->kvm->debugfs_dentry);
3922         debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3923                             &vcpu_get_pid_fops);
3924
3925         kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3926 }
3927 #endif
3928
/*
 * Create one vCPU with the userspace-supplied @id and make it reachable
 * through a new file descriptor.
 *
 * Returns the value from create_vcpu_fd() on success.  On failure returns a
 * negative errno: -EINVAL if @id is out of range or the VM already has its
 * maximum number of vCPUs, -ENOMEM if an allocation fails, -EEXIST if a vCPU
 * with @id already exists, or the error reported by one of the arch, dirty
 * ring, xarray or fd helpers.  Every failure after the created_vcpus
 * increment unwinds through the labels at the end of the function.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
        int r;
        struct kvm_vcpu *vcpu;
        struct page *page;

        if (id >= KVM_MAX_VCPU_IDS)
                return -EINVAL;

        mutex_lock(&kvm->lock);
        if (kvm->created_vcpus >= kvm->max_vcpus) {
                mutex_unlock(&kvm->lock);
                return -EINVAL;
        }

        r = kvm_arch_vcpu_precreate(kvm, id);
        if (r) {
                mutex_unlock(&kvm->lock);
                return r;
        }

        /*
         * Account for the vCPU up front, then drop the lock for the
         * allocations below.  The count is decremented again on any failure.
         */
        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);

        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
        }

        /* struct kvm_run must fit in the single page allocated for it. */
        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto vcpu_free;
        }
        vcpu->run = page_address(page);

        kvm_vcpu_init(vcpu, kvm, id);

        r = kvm_arch_vcpu_create(vcpu);
        if (r)
                goto vcpu_free_run_page;

        /* A dirty ring is allocated only if the VM has one configured. */
        if (kvm->dirty_ring_size) {
                r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
                                         id, kvm->dirty_ring_size);
                if (r)
                        goto arch_vcpu_destroy;
        }

        mutex_lock(&kvm->lock);

#ifdef CONFIG_LOCKDEP
        /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
        mutex_lock(&vcpu->mutex);
        mutex_unlock(&vcpu->mutex);
#endif

        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
        }

        /* The new vCPU takes the next free index in the vCPU array. */
        vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
        r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
        if (r)
                goto unlock_vcpu_destroy;

        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0)
                goto kvm_put_xa_release;

        /* xa_store() returning a non-NULL value is treated as a KVM bug. */
        if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
                r = -EINVAL;
                goto kvm_put_xa_release;
        }

        /*
         * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
         * pointer before kvm->online_vcpu's incremented value.
         */
        smp_wmb();
        atomic_inc(&kvm->online_vcpus);

        mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_postcreate(vcpu);
        kvm_create_vcpu_debugfs(vcpu);
        return r;

        /* Error unwinding: each label undoes one step and falls through. */
kvm_put_xa_release:
        kvm_put_kvm_no_destroy(kvm);
        xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
        kvm_dirty_ring_free(&vcpu->dirty_ring);
arch_vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
        free_page((unsigned long)vcpu->run);
vcpu_free:
        kmem_cache_free(kvm_vcpu_cache, vcpu);
vcpu_decrement:
        mutex_lock(&kvm->lock);
        kvm->created_vcpus--;
        mutex_unlock(&kvm->lock);
        return r;
}
4042
4043 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4044 {
4045         if (sigset) {
4046                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4047                 vcpu->sigset_active = 1;
4048                 vcpu->sigset = *sigset;
4049         } else
4050                 vcpu->sigset_active = 0;
4051         return 0;
4052 }
4053
/*
 * read() handler for a vCPU stats file.  Forwards to kvm_stats_read() with
 * this vCPU's stats id, the vCPU stats header and descriptor table, and the
 * vCPU's stat structure as the data to expose.  The return value is passed
 * through unchanged.
 */
static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
                              size_t size, loff_t *offset)
{
        struct kvm_vcpu *vcpu = file->private_data;

        return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
                        &kvm_vcpu_stats_desc[0], &vcpu->stat,
                        sizeof(vcpu->stat), user_buffer, size, offset);
}
4063
/*
 * release() handler for a vCPU stats file: drops the VM reference taken by
 * kvm_vcpu_ioctl_get_stats_fd() when the file was created.  Always returns 0.
 */
static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
{
        struct kvm_vcpu *vcpu = file->private_data;

        kvm_put_kvm(vcpu->kvm);
        return 0;
}
4071
/* File operations for the stats file made by kvm_vcpu_ioctl_get_stats_fd(). */
static const struct file_operations kvm_vcpu_stats_fops = {
        .read = kvm_vcpu_stats_read,
        .release = kvm_vcpu_stats_release,
        .llseek = noop_llseek,
};
4077
4078 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4079 {
4080         int fd;
4081         struct file *file;
4082         char name[15 + ITOA_MAX_LEN + 1];
4083
4084         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4085
4086         fd = get_unused_fd_flags(O_CLOEXEC);
4087         if (fd < 0)
4088                 return fd;
4089
4090         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4091         if (IS_ERR(file)) {
4092                 put_unused_fd(fd);
4093                 return PTR_ERR(file);
4094         }
4095
4096         kvm_get_kvm(vcpu->kvm);
4097
4098         file->f_mode |= FMODE_PREAD;
4099         fd_install(fd, file);
4100
4101         return fd;
4102 }
4103
4104 static long kvm_vcpu_ioctl(struct file *filp,
4105                            unsigned int ioctl, unsigned long arg)
4106 {
4107         struct kvm_vcpu *vcpu = filp->private_data;
4108         void __user *argp = (void __user *)arg;
4109         int r;
4110         struct kvm_fpu *fpu = NULL;
4111         struct kvm_sregs *kvm_sregs = NULL;
4112
4113         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4114                 return -EIO;
4115
4116         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4117                 return -EINVAL;
4118
4119         /*
4120          * Some architectures have vcpu ioctls that are asynchronous to vcpu
4121          * execution; mutex_lock() would break them.
4122          */
4123         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4124         if (r != -ENOIOCTLCMD)
4125                 return r;
4126
4127         if (mutex_lock_killable(&vcpu->mutex))
4128                 return -EINTR;
4129         switch (ioctl) {
4130         case KVM_RUN: {
4131                 struct pid *oldpid;
4132                 r = -EINVAL;
4133                 if (arg)
4134                         goto out;
4135                 oldpid = rcu_access_pointer(vcpu->pid);
4136                 if (unlikely(oldpid != task_pid(current))) {
4137                         /* The thread running this VCPU changed. */
4138                         struct pid *newpid;
4139
4140                         r = kvm_arch_vcpu_run_pid_change(vcpu);
4141                         if (r)
4142                                 break;
4143
4144                         newpid = get_task_pid(current, PIDTYPE_PID);
4145                         rcu_assign_pointer(vcpu->pid, newpid);
4146                         if (oldpid)
4147                                 synchronize_rcu();
4148                         put_pid(oldpid);
4149                 }
4150                 r = kvm_arch_vcpu_ioctl_run(vcpu);
4151                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4152                 break;
4153         }
4154         case KVM_GET_REGS: {
4155                 struct kvm_regs *kvm_regs;
4156
4157                 r = -ENOMEM;
4158                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4159                 if (!kvm_regs)
4160                         goto out;
4161                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4162                 if (r)
4163                         goto out_free1;
4164                 r = -EFAULT;
4165                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4166                         goto out_free1;
4167                 r = 0;
4168 out_free1:
4169                 kfree(kvm_regs);
4170                 break;
4171         }
4172         case KVM_SET_REGS: {
4173                 struct kvm_regs *kvm_regs;
4174
4175                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4176                 if (IS_ERR(kvm_regs)) {
4177                         r = PTR_ERR(kvm_regs);
4178                         goto out;
4179                 }
4180                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4181                 kfree(kvm_regs);
4182                 break;
4183         }
4184         case KVM_GET_SREGS: {
4185                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4186                                     GFP_KERNEL_ACCOUNT);
4187                 r = -ENOMEM;
4188                 if (!kvm_sregs)
4189                         goto out;
4190                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4191                 if (r)
4192                         goto out;
4193                 r = -EFAULT;
4194                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4195                         goto out;
4196                 r = 0;
4197                 break;
4198         }
4199         case KVM_SET_SREGS: {
4200                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4201                 if (IS_ERR(kvm_sregs)) {
4202                         r = PTR_ERR(kvm_sregs);
4203                         kvm_sregs = NULL;
4204                         goto out;
4205                 }
4206                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4207                 break;
4208         }
4209         case KVM_GET_MP_STATE: {
4210                 struct kvm_mp_state mp_state;
4211
4212                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4213                 if (r)
4214                         goto out;
4215                 r = -EFAULT;
4216                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4217                         goto out;
4218                 r = 0;
4219                 break;
4220         }
4221         case KVM_SET_MP_STATE: {
4222                 struct kvm_mp_state mp_state;
4223
4224                 r = -EFAULT;
4225                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4226                         goto out;
4227                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4228                 break;
4229         }
4230         case KVM_TRANSLATE: {
4231                 struct kvm_translation tr;
4232
4233                 r = -EFAULT;
4234                 if (copy_from_user(&tr, argp, sizeof(tr)))
4235                         goto out;
4236                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4237                 if (r)
4238                         goto out;
4239                 r = -EFAULT;
4240                 if (copy_to_user(argp, &tr, sizeof(tr)))
4241                         goto out;
4242                 r = 0;
4243                 break;
4244         }
4245         case KVM_SET_GUEST_DEBUG: {
4246                 struct kvm_guest_debug dbg;
4247
4248                 r = -EFAULT;
4249                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4250                         goto out;
4251                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4252                 break;
4253         }
4254         case KVM_SET_SIGNAL_MASK: {
4255                 struct kvm_signal_mask __user *sigmask_arg = argp;
4256                 struct kvm_signal_mask kvm_sigmask;
4257                 sigset_t sigset, *p;
4258
4259                 p = NULL;
4260                 if (argp) {
4261                         r = -EFAULT;
4262                         if (copy_from_user(&kvm_sigmask, argp,
4263                                            sizeof(kvm_sigmask)))
4264                                 goto out;
4265                         r = -EINVAL;
4266                         if (kvm_sigmask.len != sizeof(sigset))
4267                                 goto out;
4268                         r = -EFAULT;
4269                         if (copy_from_user(&sigset, sigmask_arg->sigset,
4270                                            sizeof(sigset)))
4271                                 goto out;
4272                         p = &sigset;
4273                 }
4274                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4275                 break;
4276         }
4277         case KVM_GET_FPU: {
4278                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4279                 r = -ENOMEM;
4280                 if (!fpu)
4281                         goto out;
4282                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4283                 if (r)
4284                         goto out;
4285                 r = -EFAULT;
4286                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4287                         goto out;
4288                 r = 0;
4289                 break;
4290         }
4291         case KVM_SET_FPU: {
4292                 fpu = memdup_user(argp, sizeof(*fpu));
4293                 if (IS_ERR(fpu)) {
4294                         r = PTR_ERR(fpu);
4295                         fpu = NULL;
4296                         goto out;
4297                 }
4298                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4299                 break;
4300         }
4301         case KVM_GET_STATS_FD: {
4302                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4303                 break;
4304         }
4305         default:
4306                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4307         }
4308 out:
4309         mutex_unlock(&vcpu->mutex);
4310         kfree(fpu);
4311         kfree(kvm_sregs);
4312         return r;
4313 }
4314
4315 #ifdef CONFIG_KVM_COMPAT
4316 static long kvm_vcpu_compat_ioctl(struct file *filp,
4317                                   unsigned int ioctl, unsigned long arg)
4318 {
4319         struct kvm_vcpu *vcpu = filp->private_data;
4320         void __user *argp = compat_ptr(arg);
4321         int r;
4322
4323         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4324                 return -EIO;
4325
4326         switch (ioctl) {
4327         case KVM_SET_SIGNAL_MASK: {
4328                 struct kvm_signal_mask __user *sigmask_arg = argp;
4329                 struct kvm_signal_mask kvm_sigmask;
4330                 sigset_t sigset;
4331
4332                 if (argp) {
4333                         r = -EFAULT;
4334                         if (copy_from_user(&kvm_sigmask, argp,
4335                                            sizeof(kvm_sigmask)))
4336                                 goto out;
4337                         r = -EINVAL;
4338                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
4339                                 goto out;
4340                         r = -EFAULT;
4341                         if (get_compat_sigset(&sigset,
4342                                               (compat_sigset_t __user *)sigmask_arg->sigset))
4343                                 goto out;
4344                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4345                 } else
4346                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4347                 break;
4348         }
4349         default:
4350                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4351         }
4352
4353 out:
4354         return r;
4355 }
4356 #endif
4357
4358 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4359 {
4360         struct kvm_device *dev = filp->private_data;
4361
4362         if (dev->ops->mmap)
4363                 return dev->ops->mmap(dev, vma);
4364
4365         return -ENODEV;
4366 }
4367
4368 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4369                                  int (*accessor)(struct kvm_device *dev,
4370                                                  struct kvm_device_attr *attr),
4371                                  unsigned long arg)
4372 {
4373         struct kvm_device_attr attr;
4374
4375         if (!accessor)
4376                 return -EPERM;
4377
4378         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4379                 return -EFAULT;
4380
4381         return accessor(dev, &attr);
4382 }
4383
4384 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4385                              unsigned long arg)
4386 {
4387         struct kvm_device *dev = filp->private_data;
4388
4389         if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4390                 return -EIO;
4391
4392         switch (ioctl) {
4393         case KVM_SET_DEVICE_ATTR:
4394                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4395         case KVM_GET_DEVICE_ATTR:
4396                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4397         case KVM_HAS_DEVICE_ATTR:
4398                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4399         default:
4400                 if (dev->ops->ioctl)
4401                         return dev->ops->ioctl(dev, ioctl, arg);
4402
4403                 return -ENOTTY;
4404         }
4405 }
4406
4407 static int kvm_device_release(struct inode *inode, struct file *filp)
4408 {
4409         struct kvm_device *dev = filp->private_data;
4410         struct kvm *kvm = dev->kvm;
4411
4412         if (dev->ops->release) {
4413                 mutex_lock(&kvm->lock);
4414                 list_del(&dev->vm_node);
4415                 dev->ops->release(dev);
4416                 mutex_unlock(&kvm->lock);
4417         }
4418
4419         kvm_put_kvm(kvm);
4420         return 0;
4421 }
4422
/*
 * File operations for the anonymous-inode fd created by
 * kvm_ioctl_create_device().  kvm_device_from_filp() compares a file's f_op
 * against this table to recognise KVM device files.
 */
static const struct file_operations kvm_device_fops = {
        .unlocked_ioctl = kvm_device_ioctl,
        .release = kvm_device_release,
        KVM_COMPAT(kvm_device_ioctl),
        .mmap = kvm_device_mmap,
};
4429
4430 struct kvm_device *kvm_device_from_filp(struct file *filp)
4431 {
4432         if (filp->f_op != &kvm_device_fops)
4433                 return NULL;
4434
4435         return filp->private_data;
4436 }
4437
/*
 * Device ops indexed by device type (KVM_DEV_TYPE_*).  A NULL slot means no
 * ops are registered for that type.  Slots are filled statically here and
 * at run time by kvm_register_device_ops(), and cleared by
 * kvm_unregister_device_ops().
 */
static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
#ifdef CONFIG_KVM_MPIC
        [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
        [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
#endif
};
4444
4445 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4446 {
4447         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4448                 return -ENOSPC;
4449
4450         if (kvm_device_ops_table[type] != NULL)
4451                 return -EEXIST;
4452
4453         kvm_device_ops_table[type] = ops;
4454         return 0;
4455 }
4456
4457 void kvm_unregister_device_ops(u32 type)
4458 {
4459         if (kvm_device_ops_table[type] != NULL)
4460                 kvm_device_ops_table[type] = NULL;
4461 }
4462
/*
 * Create an in-kernel device of type cd->type for @kvm and return a file
 * descriptor for it in cd->fd.
 *
 * When KVM_CREATE_DEVICE_TEST is set in cd->flags, only the existence of
 * ops for the requested type is checked and nothing is created.
 *
 * Returns 0 on success, -ENODEV if the type is out of range or has no ops
 * registered, -ENOMEM if the device cannot be allocated, or the negative
 * value returned by ops->create() or anon_inode_getfd().
 */
static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
{
        const struct kvm_device_ops *ops;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int type;
        int ret;

        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENODEV;

        /* cd->type comes from userspace; sanitize it before indexing. */
        type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
        ops = kvm_device_ops_table[type];
        if (ops == NULL)
                return -ENODEV;

        if (test)
                return 0;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
        if (!dev)
                return -ENOMEM;

        dev->ops = ops;
        dev->kvm = kvm;

        /* ->create() and the insertion into kvm->devices run under kvm->lock. */
        mutex_lock(&kvm->lock);
        ret = ops->create(dev, type);
        if (ret < 0) {
                mutex_unlock(&kvm->lock);
                kfree(dev);
                return ret;
        }
        list_add(&dev->vm_node, &kvm->devices);
        mutex_unlock(&kvm->lock);

        /* Optional ->init() hook, called after kvm->lock has been dropped. */
        if (ops->init)
                ops->init(dev);

        /* Reference held on behalf of the device fd; see kvm_device_release(). */
        kvm_get_kvm(kvm);
        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
        if (ret < 0) {
                /*
                 * No fd was created: drop the reference taken above, unlink
                 * the device and let the ops tear it down.
                 * NOTE(review): dev is not kfree()d here; presumably
                 * ->release()/->destroy() own that - confirm.
                 */
                kvm_put_kvm_no_destroy(kvm);
                mutex_lock(&kvm->lock);
                list_del(&dev->vm_node);
                if (ops->release)
                        ops->release(dev);
                mutex_unlock(&kvm->lock);
                if (ops->destroy)
                        ops->destroy(dev);
                return ret;
        }

        cd->fd = ret;
        return 0;
}
4520
/*
 * Answer KVM_CHECK_EXTENSION for capabilities that are handled by generic
 * code; anything not listed here is passed to
 * kvm_vm_ioctl_check_extension().
 *
 * @kvm may be NULL: kvm_dev_ioctl() calls this with a NULL VM.
 * @arg is the capability number (KVM_CAP_*).
 *
 * Returns 1 for capabilities that are simply present, a capability-specific
 * value for the others, and 0 for the dirty-ring capabilities when the
 * corresponding config option is not set.
 */
static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
        switch (arg) {
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
        case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
        case KVM_CAP_SIGNAL_MSI:
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
        case KVM_CAP_IRQFD:
#endif
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
        case KVM_CAP_CHECK_EXTENSION_VM:
        case KVM_CAP_ENABLE_CAP_VM:
        case KVM_CAP_HALT_POLL:
                return 1;
#ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
                return KVM_COALESCED_MMIO_PAGE_OFFSET;
        case KVM_CAP_COALESCED_PIO:
                return 1;
#endif
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
                return KVM_DIRTY_LOG_MANUAL_CAPS;
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
#endif
#if KVM_ADDRESS_SPACE_NUM > 1
        case KVM_CAP_MULTI_ADDRESS_SPACE:
                return KVM_ADDRESS_SPACE_NUM;
#endif
        case KVM_CAP_NR_MEMSLOTS:
                return KVM_USER_MEM_SLOTS;
        /*
         * For both dirty-ring flavours the reported value is the maximum
         * ring size in bytes (max entries * entry size).
         */
        case KVM_CAP_DIRTY_LOG_RING:
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
                return 0;
#endif
        case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
                return 0;
#endif
#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
        case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
#endif
        case KVM_CAP_BINARY_STATS_FD:
        case KVM_CAP_SYSTEM_EVENT_DATA:
                return 1;
        default:
                break;
        }
        return kvm_vm_ioctl_check_extension(kvm, arg);
}
4582
4583 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4584 {
4585         int r;
4586
4587         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4588                 return -EINVAL;
4589
4590         /* the size should be power of 2 */
4591         if (!size || (size & (size - 1)))
4592                 return -EINVAL;
4593
4594         /* Should be bigger to keep the reserved entries, or a page */
4595         if (size < kvm_dirty_ring_get_rsvd_entries() *
4596             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4597                 return -EINVAL;
4598
4599         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4600             sizeof(struct kvm_dirty_gfn))
4601                 return -E2BIG;
4602
4603         /* We only allow it to set once */
4604         if (kvm->dirty_ring_size)
4605                 return -EINVAL;
4606
4607         mutex_lock(&kvm->lock);
4608
4609         if (kvm->created_vcpus) {
4610                 /* We don't allow to change this value after vcpu created */
4611                 r = -EINVAL;
4612         } else {
4613                 kvm->dirty_ring_size = size;
4614                 r = 0;
4615         }
4616
4617         mutex_unlock(&kvm->lock);
4618         return r;
4619 }
4620
4621 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4622 {
4623         unsigned long i;
4624         struct kvm_vcpu *vcpu;
4625         int cleared = 0;
4626
4627         if (!kvm->dirty_ring_size)
4628                 return -EINVAL;
4629
4630         mutex_lock(&kvm->slots_lock);
4631
4632         kvm_for_each_vcpu(i, vcpu, kvm)
4633                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4634
4635         mutex_unlock(&kvm->slots_lock);
4636
4637         if (cleared)
4638                 kvm_flush_remote_tlbs(kvm);
4639
4640         return cleared;
4641 }
4642
4643 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4644                                                   struct kvm_enable_cap *cap)
4645 {
4646         return -EINVAL;
4647 }
4648
4649 bool kvm_are_all_memslots_empty(struct kvm *kvm)
4650 {
4651         int i;
4652
4653         lockdep_assert_held(&kvm->slots_lock);
4654
4655         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4656                 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4657                         return false;
4658         }
4659
4660         return true;
4661 }
4662 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4663
/*
 * Handle KVM_ENABLE_CAP on a VM for the capabilities implemented by generic
 * code; any other capability is forwarded to kvm_vm_ioctl_enable_cap().
 *
 * Returns 0 on success or a negative errno (-EINVAL for rejected flags or
 * arguments in the cases handled here).
 */
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                                           struct kvm_enable_cap *cap)
{
        switch (cap->cap) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
                u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;

                /* The remaining option bits are accepted only together with ENABLE. */
                if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
                        allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;

                if (cap->flags || (cap->args[0] & ~allowed_options))
                        return -EINVAL;
                kvm->manual_dirty_log_protect = cap->args[0];
                return 0;
        }
#endif
        case KVM_CAP_HALT_POLL: {
                /* args[0] must survive truncation to unsigned int unchanged. */
                if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
                        return -EINVAL;

                kvm->max_halt_poll_ns = cap->args[0];

                /*
                 * Ensure kvm->override_halt_poll_ns does not become visible
                 * before kvm->max_halt_poll_ns.
                 *
                 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
                 */
                smp_wmb();
                kvm->override_halt_poll_ns = true;

                return 0;
        }
        case KVM_CAP_DIRTY_LOG_RING:
        case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
                /* Refuse the flavour that check-extension reports as 0. */
                if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
                        return -EINVAL;

                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
        case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
                int r = -EINVAL;

                /* Requires the config option, a configured ring size and no flags. */
                if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
                    !kvm->dirty_ring_size || cap->flags)
                        return r;

                mutex_lock(&kvm->slots_lock);

                /*
                 * For simplicity, allow enabling ring+bitmap if and only if
                 * there are no memslots, e.g. to ensure all memslots allocate
                 * a bitmap after the capability is enabled.
                 */
                if (kvm_are_all_memslots_empty(kvm)) {
                        kvm->dirty_ring_with_bitmap = true;
                        r = 0;
                }

                mutex_unlock(&kvm->slots_lock);

                return r;
        }
        default:
                return kvm_vm_ioctl_enable_cap(kvm, cap);
        }
}
4731
4732 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4733                               size_t size, loff_t *offset)
4734 {
4735         struct kvm *kvm = file->private_data;
4736
4737         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4738                                 &kvm_vm_stats_desc[0], &kvm->stat,
4739                                 sizeof(kvm->stat), user_buffer, size, offset);
4740 }
4741
4742 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4743 {
4744         struct kvm *kvm = file->private_data;
4745
4746         kvm_put_kvm(kvm);
4747         return 0;
4748 }
4749
/*
 * File operations for the "kvm-vm-stats" fd created by
 * kvm_vm_ioctl_get_stats_fd().
 */
static const struct file_operations kvm_vm_stats_fops = {
        .read = kvm_vm_stats_read,
        .release = kvm_vm_stats_release,
        .llseek = noop_llseek,
};
4755
4756 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4757 {
4758         int fd;
4759         struct file *file;
4760
4761         fd = get_unused_fd_flags(O_CLOEXEC);
4762         if (fd < 0)
4763                 return fd;
4764
4765         file = anon_inode_getfile("kvm-vm-stats",
4766                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4767         if (IS_ERR(file)) {
4768                 put_unused_fd(fd);
4769                 return PTR_ERR(file);
4770         }
4771
4772         kvm_get_kvm(kvm);
4773
4774         file->f_mode |= FMODE_PREAD;
4775         fd_install(fd, file);
4776
4777         return fd;
4778 }
4779
/*
 * Dispatcher for ioctls issued on a VM file descriptor.
 *
 * Calls are rejected with -EIO when the caller's mm is not the VM's mm or
 * when the VM is marked dead.  For commands that take an argument
 * structure, the structure is copied in from userspace (-EFAULT on failure)
 * and passed to the matching handler.  Commands not listed here are
 * forwarded to kvm_arch_vm_ioctl().
 *
 * Returns the handler's result, or a negative errno.
 */
static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;

        if (kvm->mm != current->mm || kvm->vm_dead)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
                break;
        case KVM_ENABLE_CAP: {
                struct kvm_enable_cap cap;

                r = -EFAULT;
                if (copy_from_user(&cap, argp, sizeof(cap)))
                        goto out;
                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
                break;
        }
        case KVM_SET_USER_MEMORY_REGION: {
                struct kvm_userspace_memory_region kvm_userspace_mem;

                r = -EFAULT;
                if (copy_from_user(&kvm_userspace_mem, argp,
                                                sizeof(kvm_userspace_mem)))
                        goto out;

                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
                break;
        }
        case KVM_GET_DIRTY_LOG: {
                struct kvm_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
        }
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CLEAR_DIRTY_LOG: {
                struct kvm_clear_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
                break;
        }
#endif
#ifdef CONFIG_KVM_MMIO
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
                break;
        }
        case KVM_UNREGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
                break;
        }
#endif
        case KVM_IRQFD: {
                struct kvm_irqfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_irqfd(kvm, &data);
                break;
        }
        case KVM_IOEVENTFD: {
                struct kvm_ioeventfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_ioeventfd(kvm, &data);
                break;
        }
#ifdef CONFIG_HAVE_KVM_MSI
        case KVM_SIGNAL_MSI: {
                struct kvm_msi msi;

                r = -EFAULT;
                if (copy_from_user(&msi, argp, sizeof(msi)))
                        goto out;
                r = kvm_send_userspace_msi(kvm, &msi);
                break;
        }
#endif
#ifdef __KVM_HAVE_IRQ_LINE
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
                struct kvm_irq_level irq_event;

                r = -EFAULT;
                if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
                        goto out;

                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
                                        ioctl == KVM_IRQ_LINE_STATUS);
                if (r)
                        goto out;

                /* Only the _STATUS variant copies irq_event back to userspace. */
                r = -EFAULT;
                if (ioctl == KVM_IRQ_LINE_STATUS) {
                        if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
                                goto out;
                }

                r = 0;
                break;
        }
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
                struct kvm_irq_routing __user *urouting;
                struct kvm_irq_routing_entry *entries = NULL;

                r = -EFAULT;
                if (copy_from_user(&routing, argp, sizeof(routing)))
                        goto out;
                /* Each of the following checks fails with -EINVAL. */
                r = -EINVAL;
                if (!kvm_arch_can_set_irq_routing(kvm))
                        goto out;
                if (routing.nr > KVM_MAX_IRQ_ROUTES)
                        goto out;
                if (routing.flags)
                        goto out;
                /* entries stays NULL when routing.nr is zero. */
                if (routing.nr) {
                        urouting = argp;
                        entries = vmemdup_user(urouting->entries,
                                               array_size(sizeof(*entries),
                                                          routing.nr));
                        if (IS_ERR(entries)) {
                                r = PTR_ERR(entries);
                                goto out;
                        }
                }
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
                kvfree(entries);
                break;
        }
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
        case KVM_CREATE_DEVICE: {
                struct kvm_create_device cd;

                r = -EFAULT;
                if (copy_from_user(&cd, argp, sizeof(cd)))
                        goto out;

                r = kvm_ioctl_create_device(kvm, &cd);
                if (r)
                        goto out;

                /*
                 * Copy cd back so that userspace sees cd.fd.
                 * NOTE(review): if this copy fails, the device fd created by
                 * kvm_ioctl_create_device() appears to stay installed -
                 * confirm whether that is intended.
                 */
                r = -EFAULT;
                if (copy_to_user(argp, &cd, sizeof(cd)))
                        goto out;

                r = 0;
                break;
        }
        case KVM_CHECK_EXTENSION:
                r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
                break;
        case KVM_RESET_DIRTY_RINGS:
                r = kvm_vm_ioctl_reset_dirty_pages(kvm);
                break;
        case KVM_GET_STATS_FD:
                r = kvm_vm_ioctl_get_stats_fd(kvm);
                break;
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}
4971
4972 #ifdef CONFIG_KVM_COMPAT
/*
 * Compat layout of the KVM_GET_DIRTY_LOG argument, as copied in by
 * kvm_vm_compat_ioctl(): the bitmap pointer is a compat_uptr_t that shares
 * storage with a 64-bit pad.
 */
struct compat_kvm_dirty_log {
        __u32 slot;
        __u32 padding1;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
4981
/*
 * Compat layout of the KVM_CLEAR_DIRTY_LOG argument, as copied in by
 * kvm_vm_compat_ioctl(): the bitmap pointer is a compat_uptr_t that shares
 * storage with a 64-bit pad.
 */
struct compat_kvm_clear_dirty_log {
        __u32 slot;
        __u32 num_pages;
        __u64 first_page;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
4991
/*
 * Weak default for the arch-specific compat VM ioctl hook.  Returning
 * -ENOTTY makes kvm_vm_compat_ioctl() continue with its generic handling.
 */
long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
                                     unsigned long arg)
{
        return -ENOTTY;
}
4997
4998 static long kvm_vm_compat_ioctl(struct file *filp,
4999                            unsigned int ioctl, unsigned long arg)
5000 {
5001         struct kvm *kvm = filp->private_data;
5002         int r;
5003
5004         if (kvm->mm != current->mm || kvm->vm_dead)
5005                 return -EIO;
5006
5007         r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5008         if (r != -ENOTTY)
5009                 return r;
5010
5011         switch (ioctl) {
5012 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5013         case KVM_CLEAR_DIRTY_LOG: {
5014                 struct compat_kvm_clear_dirty_log compat_log;
5015                 struct kvm_clear_dirty_log log;
5016
5017                 if (copy_from_user(&compat_log, (void __user *)arg,
5018                                    sizeof(compat_log)))
5019                         return -EFAULT;
5020                 log.slot         = compat_log.slot;
5021                 log.num_pages    = compat_log.num_pages;
5022                 log.first_page   = compat_log.first_page;
5023                 log.padding2     = compat_log.padding2;
5024                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5025
5026                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5027                 break;
5028         }
5029 #endif
5030         case KVM_GET_DIRTY_LOG: {
5031                 struct compat_kvm_dirty_log compat_log;
5032                 struct kvm_dirty_log log;
5033
5034                 if (copy_from_user(&compat_log, (void __user *)arg,
5035                                    sizeof(compat_log)))
5036                         return -EFAULT;
5037                 log.slot         = compat_log.slot;
5038                 log.padding1     = compat_log.padding1;
5039                 log.padding2     = compat_log.padding2;
5040                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5041
5042                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5043                 break;
5044         }
5045         default:
5046                 r = kvm_vm_ioctl(filp, ioctl, arg);
5047         }
5048         return r;
5049 }
5050 #endif
5051
/*
 * File operations for the "kvm-vm" fd created by kvm_dev_ioctl_create_vm().
 * file_is_kvm() compares a file's f_op against this table to recognise VM
 * files.
 */
static const struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_vm_compat_ioctl),
};
5058
5059 bool file_is_kvm(struct file *file)
5060 {
5061         return file && file->f_op == &kvm_vm_fops;
5062 }
5063 EXPORT_SYMBOL_GPL(file_is_kvm);
5064
/*
 * Create a VM of the given @type and return a new O_CLOEXEC file descriptor
 * for it.
 *
 * The fd number is reserved first and its decimal form is passed to
 * kvm_create_vm() as the fdname argument.  The fd is installed only after
 * the backing "kvm-vm" file has been created.
 *
 * Returns the new fd on success.  On failure the reserved fd is released
 * (and the VM reference dropped if the VM had been created) and a negative
 * errno is returned.
 */
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
        char fdname[ITOA_MAX_LEN + 1];
        int r, fd;
        struct kvm *kvm;
        struct file *file;

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
                return fd;

        snprintf(fdname, sizeof(fdname), "%d", fd);

        kvm = kvm_create_vm(type, fdname);
        if (IS_ERR(kvm)) {
                r = PTR_ERR(kvm);
                goto put_fd;
        }

        file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
        if (IS_ERR(file)) {
                r = PTR_ERR(file);
                goto put_kvm;
        }

        /*
         * Don't call kvm_put_kvm anymore at this point; file->f_op is
         * already set, with ->release() being kvm_vm_release().  In error
         * cases it will be called by the final fput(file) and will take
         * care of doing kvm_put_kvm(kvm).
         */
        kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);

        fd_install(fd, file);
        return fd;

        /* Error unwinding: labels fall through, newest resource first. */
put_kvm:
        kvm_put_kvm(kvm);
put_fd:
        put_unused_fd(fd);
        return r;
}
5107
5108 static long kvm_dev_ioctl(struct file *filp,
5109                           unsigned int ioctl, unsigned long arg)
5110 {
5111         int r = -EINVAL;
5112
5113         switch (ioctl) {
5114         case KVM_GET_API_VERSION:
5115                 if (arg)
5116                         goto out;
5117                 r = KVM_API_VERSION;
5118                 break;
5119         case KVM_CREATE_VM:
5120                 r = kvm_dev_ioctl_create_vm(arg);
5121                 break;
5122         case KVM_CHECK_EXTENSION:
5123                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5124                 break;
5125         case KVM_GET_VCPU_MMAP_SIZE:
5126                 if (arg)
5127                         goto out;
5128                 r = PAGE_SIZE;     /* struct kvm_run */
5129 #ifdef CONFIG_X86
5130                 r += PAGE_SIZE;    /* pio data page */
5131 #endif
5132 #ifdef CONFIG_KVM_MMIO
5133                 r += PAGE_SIZE;    /* coalesced mmio ring page */
5134 #endif
5135                 break;
5136         case KVM_TRACE_ENABLE:
5137         case KVM_TRACE_PAUSE:
5138         case KVM_TRACE_DISABLE:
5139                 r = -EOPNOTSUPP;
5140                 break;
5141         default:
5142                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5143         }
5144 out:
5145         return r;
5146 }
5147
5148 static struct file_operations kvm_chardev_ops = {
5149         .unlocked_ioctl = kvm_dev_ioctl,
5150         .llseek         = noop_llseek,
5151         KVM_COMPAT(kvm_dev_ioctl),
5152 };
5153
5154 static struct miscdevice kvm_dev = {
5155         KVM_MINOR,
5156         "kvm",
5157         &kvm_chardev_ops,
5158 };
5159
5160 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5161 __visible bool kvm_rebooting;
5162 EXPORT_SYMBOL_GPL(kvm_rebooting);
5163
5164 static DEFINE_PER_CPU(bool, hardware_enabled);
5165 static int kvm_usage_count;
5166
5167 static int __hardware_enable_nolock(void)
5168 {
5169         if (__this_cpu_read(hardware_enabled))
5170                 return 0;
5171
5172         if (kvm_arch_hardware_enable()) {
5173                 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5174                         raw_smp_processor_id());
5175                 return -EIO;
5176         }
5177
5178         __this_cpu_write(hardware_enabled, true);
5179         return 0;
5180 }
5181
5182 static void hardware_enable_nolock(void *failed)
5183 {
5184         if (__hardware_enable_nolock())
5185                 atomic_inc(failed);
5186 }
5187
5188 static int kvm_online_cpu(unsigned int cpu)
5189 {
5190         int ret = 0;
5191
5192         /*
5193          * Abort the CPU online process if hardware virtualization cannot
5194          * be enabled. Otherwise running VMs would encounter unrecoverable
5195          * errors when scheduled to this CPU.
5196          */
5197         mutex_lock(&kvm_lock);
5198         if (kvm_usage_count)
5199                 ret = __hardware_enable_nolock();
5200         mutex_unlock(&kvm_lock);
5201         return ret;
5202 }
5203
5204 static void hardware_disable_nolock(void *junk)
5205 {
5206         /*
5207          * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5208          * hardware, not just CPUs that successfully enabled hardware!
5209          */
5210         if (!__this_cpu_read(hardware_enabled))
5211                 return;
5212
5213         kvm_arch_hardware_disable();
5214
5215         __this_cpu_write(hardware_enabled, false);
5216 }
5217
5218 static int kvm_offline_cpu(unsigned int cpu)
5219 {
5220         mutex_lock(&kvm_lock);
5221         if (kvm_usage_count)
5222                 hardware_disable_nolock(NULL);
5223         mutex_unlock(&kvm_lock);
5224         return 0;
5225 }
5226
5227 static void hardware_disable_all_nolock(void)
5228 {
5229         BUG_ON(!kvm_usage_count);
5230
5231         kvm_usage_count--;
5232         if (!kvm_usage_count)
5233                 on_each_cpu(hardware_disable_nolock, NULL, 1);
5234 }
5235
5236 static void hardware_disable_all(void)
5237 {
5238         cpus_read_lock();
5239         mutex_lock(&kvm_lock);
5240         hardware_disable_all_nolock();
5241         mutex_unlock(&kvm_lock);
5242         cpus_read_unlock();
5243 }
5244
5245 static int hardware_enable_all(void)
5246 {
5247         atomic_t failed = ATOMIC_INIT(0);
5248         int r;
5249
5250         /*
5251          * Do not enable hardware virtualization if the system is going down.
5252          * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5253          * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5254          * after kvm_reboot() is called.  Note, this relies on system_state
5255          * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5256          * hook instead of registering a dedicated reboot notifier (the latter
5257          * runs before system_state is updated).
5258          */
5259         if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5260             system_state == SYSTEM_RESTART)
5261                 return -EBUSY;
5262
5263         /*
5264          * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5265          * is called, and so on_each_cpu() between them includes the CPU that
5266          * is being onlined.  As a result, hardware_enable_nolock() may get
5267          * invoked before kvm_online_cpu(), which also enables hardware if the
5268          * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
5269          * enable hardware multiple times.
5270          */
5271         cpus_read_lock();
5272         mutex_lock(&kvm_lock);
5273
5274         r = 0;
5275
5276         kvm_usage_count++;
5277         if (kvm_usage_count == 1) {
5278                 on_each_cpu(hardware_enable_nolock, &failed, 1);
5279
5280                 if (atomic_read(&failed)) {
5281                         hardware_disable_all_nolock();
5282                         r = -EBUSY;
5283                 }
5284         }
5285
5286         mutex_unlock(&kvm_lock);
5287         cpus_read_unlock();
5288
5289         return r;
5290 }
5291
5292 static void kvm_shutdown(void)
5293 {
5294         /*
5295          * Disable hardware virtualization and set kvm_rebooting to indicate
5296          * that KVM has asynchronously disabled hardware virtualization, i.e.
5297          * that relevant errors and exceptions aren't entirely unexpected.
5298          * Some flavors of hardware virtualization need to be disabled before
5299          * transferring control to firmware (to perform shutdown/reboot), e.g.
5300          * on x86, virtualization can block INIT interrupts, which are used by
5301          * firmware to pull APs back under firmware control.  Note, this path
5302          * is used for both shutdown and reboot scenarios, i.e. neither name is
5303          * 100% comprehensive.
5304          */
5305         pr_info("kvm: exiting hardware virtualization\n");
5306         kvm_rebooting = true;
5307         on_each_cpu(hardware_disable_nolock, NULL, 1);
5308 }
5309
5310 static int kvm_suspend(void)
5311 {
5312         /*
5313          * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5314          * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5315          * is stable.  Assert that kvm_lock is not held to ensure the system
5316          * isn't suspended while KVM is enabling hardware.  Hardware enabling
5317          * can be preempted, but the task cannot be frozen until it has dropped
5318          * all locks (userspace tasks are frozen via a fake signal).
5319          */
5320         lockdep_assert_not_held(&kvm_lock);
5321         lockdep_assert_irqs_disabled();
5322
5323         if (kvm_usage_count)
5324                 hardware_disable_nolock(NULL);
5325         return 0;
5326 }
5327
5328 static void kvm_resume(void)
5329 {
5330         lockdep_assert_not_held(&kvm_lock);
5331         lockdep_assert_irqs_disabled();
5332
5333         if (kvm_usage_count)
5334                 WARN_ON_ONCE(__hardware_enable_nolock());
5335 }
5336
5337 static struct syscore_ops kvm_syscore_ops = {
5338         .suspend = kvm_suspend,
5339         .resume = kvm_resume,
5340         .shutdown = kvm_shutdown,
5341 };
5342 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
/*
 * Stub used when CONFIG_KVM_GENERIC_HARDWARE_ENABLING is not set: there is
 * nothing to enable here, so always report success.
 */
static int hardware_enable_all(void)
{
	return 0;
}
5347
/*
 * Stub used when CONFIG_KVM_GENERIC_HARDWARE_ENABLING is not set:
 * intentionally does nothing.
 */
static void hardware_disable_all(void)
{
}
5352 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5353
5354 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5355 {
5356         if (dev->ops->destructor)
5357                 dev->ops->destructor(dev);
5358 }
5359
5360 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5361 {
5362         int i;
5363
5364         for (i = 0; i < bus->dev_count; i++) {
5365                 struct kvm_io_device *pos = bus->range[i].dev;
5366
5367                 kvm_iodevice_destructor(pos);
5368         }
5369         kfree(bus);
5370 }
5371
5372 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5373                                  const struct kvm_io_range *r2)
5374 {
5375         gpa_t addr1 = r1->addr;
5376         gpa_t addr2 = r2->addr;
5377
5378         if (addr1 < addr2)
5379                 return -1;
5380
5381         /* If r2->len == 0, match the exact address.  If r2->len != 0,
5382          * accept any overlapping write.  Any order is acceptable for
5383          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5384          * we process all of them.
5385          */
5386         if (r2->len) {
5387                 addr1 += r1->len;
5388                 addr2 += r2->len;
5389         }
5390
5391         if (addr1 > addr2)
5392                 return 1;
5393
5394         return 0;
5395 }
5396
/*
 * Adapter giving kvm_io_bus_cmp() the void-pointer comparator signature
 * expected by bsearch() in kvm_io_bus_get_first_dev().
 */
static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
	const struct kvm_io_range *lhs = p1;
	const struct kvm_io_range *rhs = p2;

	return kvm_io_bus_cmp(lhs, rhs);
}
5401
5402 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5403                              gpa_t addr, int len)
5404 {
5405         struct kvm_io_range *range, key;
5406         int off;
5407
5408         key = (struct kvm_io_range) {
5409                 .addr = addr,
5410                 .len = len,
5411         };
5412
5413         range = bsearch(&key, bus->range, bus->dev_count,
5414                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5415         if (range == NULL)
5416                 return -ENOENT;
5417
5418         off = range - bus->range;
5419
5420         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5421                 off--;
5422
5423         return off;
5424 }
5425
5426 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5427                               struct kvm_io_range *range, const void *val)
5428 {
5429         int idx;
5430
5431         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5432         if (idx < 0)
5433                 return -EOPNOTSUPP;
5434
5435         while (idx < bus->dev_count &&
5436                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5437                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5438                                         range->len, val))
5439                         return idx;
5440                 idx++;
5441         }
5442
5443         return -EOPNOTSUPP;
5444 }
5445
5446 /* kvm_io_bus_write - called under kvm->slots_lock */
5447 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5448                      int len, const void *val)
5449 {
5450         struct kvm_io_bus *bus;
5451         struct kvm_io_range range;
5452         int r;
5453
5454         range = (struct kvm_io_range) {
5455                 .addr = addr,
5456                 .len = len,
5457         };
5458
5459         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5460         if (!bus)
5461                 return -ENOMEM;
5462         r = __kvm_io_bus_write(vcpu, bus, &range, val);
5463         return r < 0 ? r : 0;
5464 }
5465 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5466
5467 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5468 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5469                             gpa_t addr, int len, const void *val, long cookie)
5470 {
5471         struct kvm_io_bus *bus;
5472         struct kvm_io_range range;
5473
5474         range = (struct kvm_io_range) {
5475                 .addr = addr,
5476                 .len = len,
5477         };
5478
5479         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5480         if (!bus)
5481                 return -ENOMEM;
5482
5483         /* First try the device referenced by cookie. */
5484         if ((cookie >= 0) && (cookie < bus->dev_count) &&
5485             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5486                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5487                                         val))
5488                         return cookie;
5489
5490         /*
5491          * cookie contained garbage; fall back to search and return the
5492          * correct cookie value.
5493          */
5494         return __kvm_io_bus_write(vcpu, bus, &range, val);
5495 }
5496
5497 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5498                              struct kvm_io_range *range, void *val)
5499 {
5500         int idx;
5501
5502         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5503         if (idx < 0)
5504                 return -EOPNOTSUPP;
5505
5506         while (idx < bus->dev_count &&
5507                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5508                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5509                                        range->len, val))
5510                         return idx;
5511                 idx++;
5512         }
5513
5514         return -EOPNOTSUPP;
5515 }
5516
5517 /* kvm_io_bus_read - called under kvm->slots_lock */
5518 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5519                     int len, void *val)
5520 {
5521         struct kvm_io_bus *bus;
5522         struct kvm_io_range range;
5523         int r;
5524
5525         range = (struct kvm_io_range) {
5526                 .addr = addr,
5527                 .len = len,
5528         };
5529
5530         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5531         if (!bus)
5532                 return -ENOMEM;
5533         r = __kvm_io_bus_read(vcpu, bus, &range, val);
5534         return r < 0 ? r : 0;
5535 }
5536
5537 /* Caller must hold slots_lock. */
5538 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5539                             int len, struct kvm_io_device *dev)
5540 {
5541         int i;
5542         struct kvm_io_bus *new_bus, *bus;
5543         struct kvm_io_range range;
5544
5545         bus = kvm_get_bus(kvm, bus_idx);
5546         if (!bus)
5547                 return -ENOMEM;
5548
5549         /* exclude ioeventfd which is limited by maximum fd */
5550         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5551                 return -ENOSPC;
5552
5553         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5554                           GFP_KERNEL_ACCOUNT);
5555         if (!new_bus)
5556                 return -ENOMEM;
5557
5558         range = (struct kvm_io_range) {
5559                 .addr = addr,
5560                 .len = len,
5561                 .dev = dev,
5562         };
5563
5564         for (i = 0; i < bus->dev_count; i++)
5565                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5566                         break;
5567
5568         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5569         new_bus->dev_count++;
5570         new_bus->range[i] = range;
5571         memcpy(new_bus->range + i + 1, bus->range + i,
5572                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5573         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5574         synchronize_srcu_expedited(&kvm->srcu);
5575         kfree(bus);
5576
5577         return 0;
5578 }
5579
5580 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5581                               struct kvm_io_device *dev)
5582 {
5583         int i;
5584         struct kvm_io_bus *new_bus, *bus;
5585
5586         lockdep_assert_held(&kvm->slots_lock);
5587
5588         bus = kvm_get_bus(kvm, bus_idx);
5589         if (!bus)
5590                 return 0;
5591
5592         for (i = 0; i < bus->dev_count; i++) {
5593                 if (bus->range[i].dev == dev) {
5594                         break;
5595                 }
5596         }
5597
5598         if (i == bus->dev_count)
5599                 return 0;
5600
5601         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5602                           GFP_KERNEL_ACCOUNT);
5603         if (new_bus) {
5604                 memcpy(new_bus, bus, struct_size(bus, range, i));
5605                 new_bus->dev_count--;
5606                 memcpy(new_bus->range + i, bus->range + i + 1,
5607                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5608         }
5609
5610         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5611         synchronize_srcu_expedited(&kvm->srcu);
5612
5613         /*
5614          * If NULL bus is installed, destroy the old bus, including all the
5615          * attached devices. Otherwise, destroy the caller's device only.
5616          */
5617         if (!new_bus) {
5618                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5619                 kvm_io_bus_destroy(bus);
5620                 return -ENOMEM;
5621         }
5622
5623         kvm_iodevice_destructor(dev);
5624         kfree(bus);
5625         return 0;
5626 }
5627
5628 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5629                                          gpa_t addr)
5630 {
5631         struct kvm_io_bus *bus;
5632         int dev_idx, srcu_idx;
5633         struct kvm_io_device *iodev = NULL;
5634
5635         srcu_idx = srcu_read_lock(&kvm->srcu);
5636
5637         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5638         if (!bus)
5639                 goto out_unlock;
5640
5641         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5642         if (dev_idx < 0)
5643                 goto out_unlock;
5644
5645         iodev = bus->range[dev_idx].dev;
5646
5647 out_unlock:
5648         srcu_read_unlock(&kvm->srcu, srcu_idx);
5649
5650         return iodev;
5651 }
5652 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5653
5654 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5655                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5656                            const char *fmt)
5657 {
5658         int ret;
5659         struct kvm_stat_data *stat_data = inode->i_private;
5660
5661         /*
5662          * The debugfs files are a reference to the kvm struct which
5663         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5664         * avoids the race between open and the removal of the debugfs directory.
5665          */
5666         if (!kvm_get_kvm_safe(stat_data->kvm))
5667                 return -ENOENT;
5668
5669         ret = simple_attr_open(inode, file, get,
5670                                kvm_stats_debugfs_mode(stat_data->desc) & 0222
5671                                ? set : NULL, fmt);
5672         if (ret)
5673                 kvm_put_kvm(stat_data->kvm);
5674
5675         return ret;
5676 }
5677
5678 static int kvm_debugfs_release(struct inode *inode, struct file *file)
5679 {
5680         struct kvm_stat_data *stat_data = inode->i_private;
5681
5682         simple_attr_release(inode, file);
5683         kvm_put_kvm(stat_data->kvm);
5684
5685         return 0;
5686 }
5687
5688 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5689 {
5690         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5691
5692         return 0;
5693 }
5694
5695 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5696 {
5697         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5698
5699         return 0;
5700 }
5701
5702 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5703 {
5704         unsigned long i;
5705         struct kvm_vcpu *vcpu;
5706
5707         *val = 0;
5708
5709         kvm_for_each_vcpu(i, vcpu, kvm)
5710                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5711
5712         return 0;
5713 }
5714
5715 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5716 {
5717         unsigned long i;
5718         struct kvm_vcpu *vcpu;
5719
5720         kvm_for_each_vcpu(i, vcpu, kvm)
5721                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5722
5723         return 0;
5724 }
5725
5726 static int kvm_stat_data_get(void *data, u64 *val)
5727 {
5728         int r = -EFAULT;
5729         struct kvm_stat_data *stat_data = data;
5730
5731         switch (stat_data->kind) {
5732         case KVM_STAT_VM:
5733                 r = kvm_get_stat_per_vm(stat_data->kvm,
5734                                         stat_data->desc->desc.offset, val);
5735                 break;
5736         case KVM_STAT_VCPU:
5737                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5738                                           stat_data->desc->desc.offset, val);
5739                 break;
5740         }
5741
5742         return r;
5743 }
5744
5745 static int kvm_stat_data_clear(void *data, u64 val)
5746 {
5747         int r = -EFAULT;
5748         struct kvm_stat_data *stat_data = data;
5749
5750         if (val)
5751                 return -EINVAL;
5752
5753         switch (stat_data->kind) {
5754         case KVM_STAT_VM:
5755                 r = kvm_clear_stat_per_vm(stat_data->kvm,
5756                                           stat_data->desc->desc.offset);
5757                 break;
5758         case KVM_STAT_VCPU:
5759                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5760                                             stat_data->desc->desc.offset);
5761                 break;
5762         }
5763
5764         return r;
5765 }
5766
5767 static int kvm_stat_data_open(struct inode *inode, struct file *file)
5768 {
5769         __simple_attr_check_format("%llu\n", 0ull);
5770         return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5771                                 kvm_stat_data_clear, "%llu\n");
5772 }
5773
5774 static const struct file_operations stat_fops_per_vm = {
5775         .owner = THIS_MODULE,
5776         .open = kvm_stat_data_open,
5777         .release = kvm_debugfs_release,
5778         .read = simple_attr_read,
5779         .write = simple_attr_write,
5780         .llseek = no_llseek,
5781 };
5782
5783 static int vm_stat_get(void *_offset, u64 *val)
5784 {
5785         unsigned offset = (long)_offset;
5786         struct kvm *kvm;
5787         u64 tmp_val;
5788
5789         *val = 0;
5790         mutex_lock(&kvm_lock);
5791         list_for_each_entry(kvm, &vm_list, vm_list) {
5792                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5793                 *val += tmp_val;
5794         }
5795         mutex_unlock(&kvm_lock);
5796         return 0;
5797 }
5798
5799 static int vm_stat_clear(void *_offset, u64 val)
5800 {
5801         unsigned offset = (long)_offset;
5802         struct kvm *kvm;
5803
5804         if (val)
5805                 return -EINVAL;
5806
5807         mutex_lock(&kvm_lock);
5808         list_for_each_entry(kvm, &vm_list, vm_list) {
5809                 kvm_clear_stat_per_vm(kvm, offset);
5810         }
5811         mutex_unlock(&kvm_lock);
5812
5813         return 0;
5814 }
5815
5816 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5817 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5818
5819 static int vcpu_stat_get(void *_offset, u64 *val)
5820 {
5821         unsigned offset = (long)_offset;
5822         struct kvm *kvm;
5823         u64 tmp_val;
5824
5825         *val = 0;
5826         mutex_lock(&kvm_lock);
5827         list_for_each_entry(kvm, &vm_list, vm_list) {
5828                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5829                 *val += tmp_val;
5830         }
5831         mutex_unlock(&kvm_lock);
5832         return 0;
5833 }
5834
5835 static int vcpu_stat_clear(void *_offset, u64 val)
5836 {
5837         unsigned offset = (long)_offset;
5838         struct kvm *kvm;
5839
5840         if (val)
5841                 return -EINVAL;
5842
5843         mutex_lock(&kvm_lock);
5844         list_for_each_entry(kvm, &vm_list, vm_list) {
5845                 kvm_clear_stat_per_vcpu(kvm, offset);
5846         }
5847         mutex_unlock(&kvm_lock);
5848
5849         return 0;
5850 }
5851
5852 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5853                         "%llu\n");
5854 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5855
5856 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5857 {
5858         struct kobj_uevent_env *env;
5859         unsigned long long created, active;
5860
5861         if (!kvm_dev.this_device || !kvm)
5862                 return;
5863
5864         mutex_lock(&kvm_lock);
5865         if (type == KVM_EVENT_CREATE_VM) {
5866                 kvm_createvm_count++;
5867                 kvm_active_vms++;
5868         } else if (type == KVM_EVENT_DESTROY_VM) {
5869                 kvm_active_vms--;
5870         }
5871         created = kvm_createvm_count;
5872         active = kvm_active_vms;
5873         mutex_unlock(&kvm_lock);
5874
5875         env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5876         if (!env)
5877                 return;
5878
5879         add_uevent_var(env, "CREATED=%llu", created);
5880         add_uevent_var(env, "COUNT=%llu", active);
5881
5882         if (type == KVM_EVENT_CREATE_VM) {
5883                 add_uevent_var(env, "EVENT=create");
5884                 kvm->userspace_pid = task_pid_nr(current);
5885         } else if (type == KVM_EVENT_DESTROY_VM) {
5886                 add_uevent_var(env, "EVENT=destroy");
5887         }
5888         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5889
5890         if (!IS_ERR(kvm->debugfs_dentry)) {
5891                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5892
5893                 if (p) {
5894                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5895                         if (!IS_ERR(tmp))
5896                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
5897                         kfree(p);
5898                 }
5899         }
5900         /* no need for checks, since we are adding at most only 5 keys */
5901         env->envp[env->envp_idx++] = NULL;
5902         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5903         kfree(env);
5904 }
5905
5906 static void kvm_init_debug(void)
5907 {
5908         const struct file_operations *fops;
5909         const struct _kvm_stats_desc *pdesc;
5910         int i;
5911
5912         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5913
5914         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5915                 pdesc = &kvm_vm_stats_desc[i];
5916                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5917                         fops = &vm_stat_fops;
5918                 else
5919                         fops = &vm_stat_readonly_fops;
5920                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5921                                 kvm_debugfs_dir,
5922                                 (void *)(long)pdesc->desc.offset, fops);
5923         }
5924
5925         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5926                 pdesc = &kvm_vcpu_stats_desc[i];
5927                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5928                         fops = &vcpu_stat_fops;
5929                 else
5930                         fops = &vcpu_stat_readonly_fops;
5931                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5932                                 kvm_debugfs_dir,
5933                                 (void *)(long)pdesc->desc.offset, fops);
5934         }
5935 }
5936
5937 static inline
5938 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5939 {
5940         return container_of(pn, struct kvm_vcpu, preempt_notifier);
5941 }
5942
5943 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5944 {
5945         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5946
5947         WRITE_ONCE(vcpu->preempted, false);
5948         WRITE_ONCE(vcpu->ready, false);
5949
5950         __this_cpu_write(kvm_running_vcpu, vcpu);
5951         kvm_arch_sched_in(vcpu, cpu);
5952         kvm_arch_vcpu_load(vcpu, cpu);
5953 }
5954
5955 static void kvm_sched_out(struct preempt_notifier *pn,
5956                           struct task_struct *next)
5957 {
5958         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5959
5960         if (current->on_rq) {
5961                 WRITE_ONCE(vcpu->preempted, true);
5962                 WRITE_ONCE(vcpu->ready, true);
5963         }
5964         kvm_arch_vcpu_put(vcpu);
5965         __this_cpu_write(kvm_running_vcpu, NULL);
5966 }
5967
5968 /**
5969  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5970  *
5971  * We can disable preemption locally around accessing the per-CPU variable,
5972  * and use the resolved vcpu pointer after enabling preemption again,
5973  * because even if the current thread is migrated to another CPU, reading
5974  * the per-CPU value later will give us the same value as we update the
5975  * per-CPU variable in the preempt notifier handlers.
5976  */
5977 struct kvm_vcpu *kvm_get_running_vcpu(void)
5978 {
5979         struct kvm_vcpu *vcpu;
5980
5981         preempt_disable();
5982         vcpu = __this_cpu_read(kvm_running_vcpu);
5983         preempt_enable();
5984
5985         return vcpu;
5986 }
5987 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5988
5989 /**
5990  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5991  */
5992 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5993 {
5994         return &kvm_running_vcpu;
5995 }
5996
5997 #ifdef CONFIG_GUEST_PERF_EVENTS
5998 static unsigned int kvm_guest_state(void)
5999 {
6000         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6001         unsigned int state;
6002
6003         if (!kvm_arch_pmi_in_guest(vcpu))
6004                 return 0;
6005
6006         state = PERF_GUEST_ACTIVE;
6007         if (!kvm_arch_vcpu_in_kernel(vcpu))
6008                 state |= PERF_GUEST_USER;
6009
6010         return state;
6011 }
6012
6013 static unsigned long kvm_guest_get_ip(void)
6014 {
6015         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6016
6017         /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6018         if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6019                 return 0;
6020
6021         return kvm_arch_vcpu_get_ip(vcpu);
6022 }
6023
6024 static struct perf_guest_info_callbacks kvm_guest_cbs = {
6025         .state                  = kvm_guest_state,
6026         .get_ip                 = kvm_guest_get_ip,
6027         .handle_intel_pt_intr   = NULL,
6028 };
6029
6030 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6031 {
6032         kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6033         perf_register_guest_info_callbacks(&kvm_guest_cbs);
6034 }
6035 void kvm_unregister_perf_callbacks(void)
6036 {
6037         perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6038 }
6039 #endif
6040
6041 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6042 {
6043         int r;
6044         int cpu;
6045
6046 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6047         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6048                                       kvm_online_cpu, kvm_offline_cpu);
6049         if (r)
6050                 return r;
6051
6052         register_syscore_ops(&kvm_syscore_ops);
6053 #endif
6054
6055         /* A kmem cache lets us meet the alignment requirements of fx_save. */
6056         if (!vcpu_align)
6057                 vcpu_align = __alignof__(struct kvm_vcpu);
6058         kvm_vcpu_cache =
6059                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6060                                            SLAB_ACCOUNT,
6061                                            offsetof(struct kvm_vcpu, arch),
6062                                            offsetofend(struct kvm_vcpu, stats_id)
6063                                            - offsetof(struct kvm_vcpu, arch),
6064                                            NULL);
6065         if (!kvm_vcpu_cache) {
6066                 r = -ENOMEM;
6067                 goto err_vcpu_cache;
6068         }
6069
6070         for_each_possible_cpu(cpu) {
6071                 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6072                                             GFP_KERNEL, cpu_to_node(cpu))) {
6073                         r = -ENOMEM;
6074                         goto err_cpu_kick_mask;
6075                 }
6076         }
6077
6078         r = kvm_irqfd_init();
6079         if (r)
6080                 goto err_irqfd;
6081
6082         r = kvm_async_pf_init();
6083         if (r)
6084                 goto err_async_pf;
6085
6086         kvm_chardev_ops.owner = module;
6087
6088         kvm_preempt_ops.sched_in = kvm_sched_in;
6089         kvm_preempt_ops.sched_out = kvm_sched_out;
6090
6091         kvm_init_debug();
6092
6093         r = kvm_vfio_ops_init();
6094         if (WARN_ON_ONCE(r))
6095                 goto err_vfio;
6096
6097         /*
6098          * Registration _must_ be the very last thing done, as this exposes
6099          * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6100          */
6101         r = misc_register(&kvm_dev);
6102         if (r) {
6103                 pr_err("kvm: misc device register failed\n");
6104                 goto err_register;
6105         }
6106
6107         return 0;
6108
6109 err_register:
6110         kvm_vfio_ops_exit();
6111 err_vfio:
6112         kvm_async_pf_deinit();
6113 err_async_pf:
6114         kvm_irqfd_exit();
6115 err_irqfd:
6116 err_cpu_kick_mask:
6117         for_each_possible_cpu(cpu)
6118                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6119         kmem_cache_destroy(kvm_vcpu_cache);
6120 err_vcpu_cache:
6121 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6122         unregister_syscore_ops(&kvm_syscore_ops);
6123         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6124 #endif
6125         return r;
6126 }
6127 EXPORT_SYMBOL_GPL(kvm_init);
6128
6129 void kvm_exit(void)
6130 {
6131         int cpu;
6132
6133         /*
6134          * Note, unregistering /dev/kvm doesn't strictly need to come first,
6135          * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6136          * to KVM while the module is being stopped.
6137          */
6138         misc_deregister(&kvm_dev);
6139
6140         debugfs_remove_recursive(kvm_debugfs_dir);
6141         for_each_possible_cpu(cpu)
6142                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6143         kmem_cache_destroy(kvm_vcpu_cache);
6144         kvm_vfio_ops_exit();
6145         kvm_async_pf_deinit();
6146 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6147         unregister_syscore_ops(&kvm_syscore_ops);
6148         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6149 #endif
6150         kvm_irqfd_exit();
6151 }
6152 EXPORT_SYMBOL_GPL(kvm_exit);
6153
6154 struct kvm_vm_worker_thread_context {
6155         struct kvm *kvm;
6156         struct task_struct *parent;
6157         struct completion init_done;
6158         kvm_vm_thread_fn_t thread_fn;
6159         uintptr_t data;
6160         int err;
6161 };
6162
6163 static int kvm_vm_worker_thread(void *context)
6164 {
6165         /*
6166          * The init_context is allocated on the stack of the parent thread, so
6167          * we have to locally copy anything that is needed beyond initialization
6168          */
6169         struct kvm_vm_worker_thread_context *init_context = context;
6170         struct task_struct *parent;
6171         struct kvm *kvm = init_context->kvm;
6172         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6173         uintptr_t data = init_context->data;
6174         int err;
6175
6176         err = kthread_park(current);
6177         /* kthread_park(current) is never supposed to return an error */
6178         WARN_ON(err != 0);
6179         if (err)
6180                 goto init_complete;
6181
6182         err = cgroup_attach_task_all(init_context->parent, current);
6183         if (err) {
6184                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6185                         __func__, err);
6186                 goto init_complete;
6187         }
6188
6189         set_user_nice(current, task_nice(init_context->parent));
6190
6191 init_complete:
6192         init_context->err = err;
6193         complete(&init_context->init_done);
6194         init_context = NULL;
6195
6196         if (err)
6197                 goto out;
6198
6199         /* Wait to be woken up by the spawner before proceeding. */
6200         kthread_parkme();
6201
6202         if (!kthread_should_stop())
6203                 err = thread_fn(kvm, data);
6204
6205 out:
6206         /*
6207          * Move kthread back to its original cgroup to prevent it lingering in
6208          * the cgroup of the VM process, after the latter finishes its
6209          * execution.
6210          *
6211          * kthread_stop() waits on the 'exited' completion condition which is
6212          * set in exit_mm(), via mm_release(), in do_exit(). However, the
6213          * kthread is removed from the cgroup in the cgroup_exit() which is
6214          * called after the exit_mm(). This causes the kthread_stop() to return
6215          * before the kthread actually quits the cgroup.
6216          */
6217         rcu_read_lock();
6218         parent = rcu_dereference(current->real_parent);
6219         get_task_struct(parent);
6220         rcu_read_unlock();
6221         cgroup_attach_task_all(parent, current);
6222         put_task_struct(parent);
6223
6224         return err;
6225 }
6226
6227 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6228                                 uintptr_t data, const char *name,
6229                                 struct task_struct **thread_ptr)
6230 {
6231         struct kvm_vm_worker_thread_context init_context = {};
6232         struct task_struct *thread;
6233
6234         *thread_ptr = NULL;
6235         init_context.kvm = kvm;
6236         init_context.parent = current;
6237         init_context.thread_fn = thread_fn;
6238         init_context.data = data;
6239         init_completion(&init_context.init_done);
6240
6241         thread = kthread_run(kvm_vm_worker_thread, &init_context,
6242                              "%s-%d", name, task_pid_nr(current));
6243         if (IS_ERR(thread))
6244                 return PTR_ERR(thread);
6245
6246         /* kthread_run is never supposed to return NULL */
6247         WARN_ON(thread == NULL);
6248
6249         wait_for_completion(&init_context.init_done);
6250
6251         if (!init_context.err)
6252                 *thread_ptr = thread;
6253
6254         return init_context.err;
6255 }