virt/kvm/kvm_main.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <avi@qumranet.com>
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  */
  15
  16 #include <kvm/iodev.h>
  17
  18 #include <linux/kvm_host.h>
  19 #include <linux/kvm.h>
  20 #include <linux/module.h>
  21 #include <linux/errno.h>
  22 #include <linux/percpu.h>
  23 #include <linux/mm.h>
  24 #include <linux/miscdevice.h>
  25 #include <linux/vmalloc.h>
  26 #include <linux/reboot.h>
  27 #include <linux/debugfs.h>
  28 #include <linux/highmem.h>
  29 #include <linux/file.h>
  30 #include <linux/syscore_ops.h>
  31 #include <linux/cpu.h>
  32 #include <linux/sched/signal.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/sched/stat.h>
  35 #include <linux/cpumask.h>
  36 #include <linux/smp.h>
  37 #include <linux/anon_inodes.h>
  38 #include <linux/profile.h>
  39 #include <linux/kvm_para.h>
  40 #include <linux/pagemap.h>
  41 #include <linux/mman.h>
  42 #include <linux/swap.h>
  43 #include <linux/bitops.h>
  44 #include <linux/spinlock.h>
  45 #include <linux/compat.h>
  46 #include <linux/srcu.h>
  47 #include <linux/hugetlb.h>
  48 #include <linux/slab.h>
  49 #include <linux/sort.h>
  50 #include <linux/bsearch.h>
  51 #include <linux/io.h>
  52 #include <linux/lockdep.h>
  53 #include <linux/kthread.h>
  54 #include <linux/suspend.h>
  55
  56 #include <asm/processor.h>
  57 #include <asm/ioctl.h>
  58 #include <linux/uaccess.h>
  59
  60 #include "coalesced_mmio.h"
  61 #include "async_pf.h"
  62 #include "mmu_lock.h"
  63 #include "vfio.h"
  64
  65 #define CREATE_TRACE_POINTS
  66 #include <trace/events/kvm.h>
  67
  68 #include <linux/kvm_dirty_ring.h>
  69
  70 /* Worst case buffer size needed for holding an integer. */
  71 #define ITOA_MAX_LEN 12
  72
  73 MODULE_AUTHOR("Qumranet");
  74 MODULE_LICENSE("GPL");
  75
  76 /* Architectures should define their poll value according to the halt latency */
  77 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
  78 module_param(halt_poll_ns, uint, 0644);
  79 EXPORT_SYMBOL_GPL(halt_poll_ns);
  80
  81 /* Default doubles per-vcpu halt_poll_ns. */
  82 unsigned int halt_poll_ns_grow = 2;
  83 module_param(halt_poll_ns_grow, uint, 0644);
  84 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
  85
  86 /* The start value to grow halt_poll_ns from */
  87 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
  88 module_param(halt_poll_ns_grow_start, uint, 0644);
  89 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
  90
  91 /* Default resets per-vcpu halt_poll_ns . */
  92 unsigned int halt_poll_ns_shrink;
  93 module_param(halt_poll_ns_shrink, uint, 0644);
  94 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  95
  96 /*
  97  * Ordering of locks:
  98  *
  99  *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 100  */
 101
 102 DEFINE_MUTEX(kvm_lock);
 103 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 104 LIST_HEAD(vm_list);
 105
 106 static cpumask_var_t cpus_hardware_enabled;
 107 static int kvm_usage_count;
 108 static atomic_t hardware_enable_failed;
 109
 110 static struct kmem_cache *kvm_vcpu_cache;
 111
 112 static __read_mostly struct preempt_ops kvm_preempt_ops;
 113 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 114
 115 struct dentry *kvm_debugfs_dir;
 116 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 117
 118 static const struct file_operations stat_fops_per_vm;
 119
 120 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 121                            unsigned long arg);
 122 #ifdef CONFIG_KVM_COMPAT
 123 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 124                                   unsigned long arg);
 125 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
 126 #else
 127 /*
 128  * For architectures that don't implement a compat infrastructure,
 129  * adopt a double line of defense:
 130  * - Prevent a compat task from opening /dev/kvm
 131  * - If the open has been done by a 64bit task, and the KVM fd
 132  *   passed to a compat task, let the ioctls fail.
 133  */
 134 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 135                                 unsigned long arg) { return -EINVAL; }
 136
 137 static int kvm_no_compat_open(struct inode *inode, struct file *file)
 138 {
 139         return is_compat_task() ? -ENODEV : 0;
 140 }
 141 #define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
 142                         .open           = kvm_no_compat_open
 143 #endif
 144 static int hardware_enable_all(void);
 145 static void hardware_disable_all(void);
 146
 147 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 148
 149 __visible bool kvm_rebooting;
 150 EXPORT_SYMBOL_GPL(kvm_rebooting);
 151
 152 #define KVM_EVENT_CREATE_VM 0
 153 #define KVM_EVENT_DESTROY_VM 1
 154 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 155 static unsigned long long kvm_createvm_count;
 156 static unsigned long long kvm_active_vms;
 157
 158 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 159
 160 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 161                                                    unsigned long start, unsigned long end)
 162 {
 163 }
 164
 165 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 166 {
 167         /*
 168          * The metadata used by is_zone_device_page() to determine whether or
 169          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
 170          * the device has been pinned, e.g. by get_user_pages().  WARN if the
 171          * page_count() is zero to help detect bad usage of this helper.
 172          */
 173         if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
 174                 return false;
 175
 176         return is_zone_device_page(pfn_to_page(pfn));
 177 }
 178
 179 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 180 {
 181         /*
 182          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 183          * perspective they are "normal" pages, albeit with slightly different
 184          * usage rules.
 185          */
 186         if (pfn_valid(pfn))
 187                 return PageReserved(pfn_to_page(pfn)) &&
 188                        !is_zero_pfn(pfn) &&
 189                        !kvm_is_zone_device_pfn(pfn);
 190
 191         return true;
 192 }
 193
 194 /*
 195  * Switches to specified vcpu, until a matching vcpu_put()
 196  */
 197 void vcpu_load(struct kvm_vcpu *vcpu)
 198 {
 199         int cpu = get_cpu();
 200
 201         __this_cpu_write(kvm_running_vcpu, vcpu);
 202         preempt_notifier_register(&vcpu->preempt_notifier);
 203         kvm_arch_vcpu_load(vcpu, cpu);
 204         put_cpu();
 205 }
 206 EXPORT_SYMBOL_GPL(vcpu_load);
 207
 208 void vcpu_put(struct kvm_vcpu *vcpu)
 209 {
 210         preempt_disable();
 211         kvm_arch_vcpu_put(vcpu);
 212         preempt_notifier_unregister(&vcpu->preempt_notifier);
 213         __this_cpu_write(kvm_running_vcpu, NULL);
 214         preempt_enable();
 215 }
 216 EXPORT_SYMBOL_GPL(vcpu_put);
 217
 218 /* TODO: merge with kvm_arch_vcpu_should_kick */
 219 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 220 {
 221         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 222
 223         /*
 224          * We need to wait for the VCPU to reenable interrupts and get out of
 225          * READING_SHADOW_PAGE_TABLES mode.
 226          */
 227         if (req & KVM_REQUEST_WAIT)
 228                 return mode != OUTSIDE_GUEST_MODE;
 229
 230         /*
 231          * Need to kick a running VCPU, but otherwise there is nothing to do.
 232          */
 233         return mode == IN_GUEST_MODE;
 234 }
 235
 236 static void ack_flush(void *_completed)
 237 {
 238 }
 239
 240 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 241 {
 242         if (cpumask_empty(cpus))
 243                 return false;
 244
 245         smp_call_function_many(cpus, ack_flush, NULL, wait);
 246         return true;
 247 }
 248
 249 static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
 250                                   unsigned int req, struct cpumask *tmp,
 251                                   int current_cpu)
 252 {
 253         int cpu;
 254
 255         kvm_make_request(req, vcpu);
 256
 257         if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 258                 return;
 259
 260         /*
 261          * Note, the vCPU could get migrated to a different pCPU at any point
 262          * after kvm_request_needs_ipi(), which could result in sending an IPI
 263          * to the previous pCPU.  But, that's OK because the purpose of the IPI
 264          * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
 265          * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
 266          * after this point is also OK, as the requirement is only that KVM wait
 267          * for vCPUs that were reading SPTEs _before_ any changes were
 268          * finalized. See kvm_vcpu_kick() for more details on handling requests.
 269          */
 270         if (kvm_request_needs_ipi(vcpu, req)) {
 271                 cpu = READ_ONCE(vcpu->cpu);
 272                 if (cpu != -1 && cpu != current_cpu)
 273                         __cpumask_set_cpu(cpu, tmp);
 274         }
 275 }
 276
 277 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 278                                  unsigned long *vcpu_bitmap)
 279 {
 280         struct kvm_vcpu *vcpu;
 281         struct cpumask *cpus;
 282         int i, me;
 283         bool called;
 284
 285         me = get_cpu();
 286
 287         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 288         cpumask_clear(cpus);
 289
 290         for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
 291                 vcpu = kvm_get_vcpu(kvm, i);
 292                 if (!vcpu)
 293                         continue;
 294                 kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
 295         }
 296
 297         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 298         put_cpu();
 299
 300         return called;
 301 }
 302
 303 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 304                                       struct kvm_vcpu *except)
 305 {
 306         struct kvm_vcpu *vcpu;
 307         struct cpumask *cpus;
 308         unsigned long i;
 309         bool called;
 310         int me;
 311
 312         me = get_cpu();
 313
 314         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 315         cpumask_clear(cpus);
 316
 317         kvm_for_each_vcpu(i, vcpu, kvm) {
 318                 if (vcpu == except)
 319                         continue;
 320                 kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
 321         }
 322
 323         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 324         put_cpu();
 325
 326         return called;
 327 }
 328
 329 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 330 {
 331         return kvm_make_all_cpus_request_except(kvm, req, NULL);
 332 }
 333 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 334
 335 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 336 void kvm_flush_remote_tlbs(struct kvm *kvm)
 337 {
 338         ++kvm->stat.generic.remote_tlb_flush_requests;
 339
 340         /*
 341          * We want to publish modifications to the page tables before reading
 342          * mode. Pairs with a memory barrier in arch-specific code.
 343          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 344          * and smp_mb in walk_shadow_page_lockless_begin/end.
 345          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 346          *
 347          * There is already an smp_mb__after_atomic() before
 348          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 349          * barrier here.
 350          */
 351         if (!kvm_arch_flush_remote_tlb(kvm)
 352             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 353                 ++kvm->stat.generic.remote_tlb_flush;
 354 }
 355 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 356 #endif
 357
 358 void kvm_reload_remote_mmus(struct kvm *kvm)
 359 {
 360         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 361 }
 362
 363 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 364 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 365                                                gfp_t gfp_flags)
 366 {
 367         gfp_flags |= mc->gfp_zero;
 368
 369         if (mc->kmem_cache)
 370                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 371         else
 372                 return (void *)__get_free_page(gfp_flags);
 373 }
 374
 375 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 376 {
 377         void *obj;
 378
 379         if (mc->nobjs >= min)
 380                 return 0;
 381         while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
 382                 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
 383                 if (!obj)
 384                         return mc->nobjs >= min ? 0 : -ENOMEM;
 385                 mc->objects[mc->nobjs++] = obj;
 386         }
 387         return 0;
 388 }
 389
 390 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 391 {
 392         return mc->nobjs;
 393 }
 394
 395 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 396 {
 397         while (mc->nobjs) {
 398                 if (mc->kmem_cache)
 399                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 400                 else
 401                         free_page((unsigned long)mc->objects[--mc->nobjs]);
 402         }
 403 }
 404
 405 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 406 {
 407         void *p;
 408
 409         if (WARN_ON(!mc->nobjs))
 410                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 411         else
 412                 p = mc->objects[--mc->nobjs];
 413         BUG_ON(!p);
 414         return p;
 415 }
 416 #endif
 417
 418 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 419 {
 420         mutex_init(&vcpu->mutex);
 421         vcpu->cpu = -1;
 422         vcpu->kvm = kvm;
 423         vcpu->vcpu_id = id;
 424         vcpu->pid = NULL;
 425         rcuwait_init(&vcpu->wait);
 426         kvm_async_pf_vcpu_init(vcpu);
 427
 428         vcpu->pre_pcpu = -1;
 429         INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
 430
 431         kvm_vcpu_set_in_spin_loop(vcpu, false);
 432         kvm_vcpu_set_dy_eligible(vcpu, false);
 433         vcpu->preempted = false;
 434         vcpu->ready = false;
 435         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 436         vcpu->last_used_slot = 0;
 437 }
 438
 439 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 440 {
 441         kvm_dirty_ring_free(&vcpu->dirty_ring);
 442         kvm_arch_vcpu_destroy(vcpu);
 443
 444         /*
 445          * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 446          * the vcpu->pid pointer, and at destruction time all file descriptors
 447          * are already gone.
 448          */
 449         put_pid(rcu_dereference_protected(vcpu->pid, 1));
 450
 451         free_page((unsigned long)vcpu->run);
 452         kmem_cache_free(kvm_vcpu_cache, vcpu);
 453 }
 454
 455 void kvm_destroy_vcpus(struct kvm *kvm)
 456 {
 457         unsigned long i;
 458         struct kvm_vcpu *vcpu;
 459
 460         kvm_for_each_vcpu(i, vcpu, kvm) {
 461                 kvm_vcpu_destroy(vcpu);
 462                 xa_erase(&kvm->vcpu_array, i);
 463         }
 464
 465         atomic_set(&kvm->online_vcpus, 0);
 466 }
 467 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 468
 469 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 470 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 471 {
 472         return container_of(mn, struct kvm, mmu_notifier);
 473 }
 474
 475 static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 476                                               struct mm_struct *mm,
 477                                               unsigned long start, unsigned long end)
 478 {
 479         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 480         int idx;
 481
 482         idx = srcu_read_lock(&kvm->srcu);
 483         kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
 484         srcu_read_unlock(&kvm->srcu, idx);
 485 }
 486
 487 typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 488
 489 typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 490                              unsigned long end);
 491
 492 struct kvm_hva_range {
 493         unsigned long start;
 494         unsigned long end;
 495         pte_t pte;
 496         hva_handler_t handler;
 497         on_lock_fn_t on_lock;
 498         bool flush_on_ret;
 499         bool may_block;
 500 };
 501
 502 /*
 503  * Use a dedicated stub instead of NULL to indicate that there is no callback
 504  * function/handler.  The compiler technically can't guarantee that a real
 505  * function will have a non-zero address, and so it will generate code to
 506  * check for !NULL, whereas comparing against a stub will be elided at compile
 507  * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 508  */
 509 static void kvm_null_fn(void)
 510 {
 511
 512 }
 513 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 514
 515 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 516                                                   const struct kvm_hva_range *range)
 517 {
 518         bool ret = false, locked = false;
 519         struct kvm_gfn_range gfn_range;
 520         struct kvm_memory_slot *slot;
 521         struct kvm_memslots *slots;
 522         int i, idx;
 523
 524         /* A null handler is allowed if and only if on_lock() is provided. */
 525         if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 526                          IS_KVM_NULL_FN(range->handler)))
 527                 return 0;
 528
 529         idx = srcu_read_lock(&kvm->srcu);
 530
 531         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 532                 slots = __kvm_memslots(kvm, i);
 533                 kvm_for_each_memslot(slot, slots) {
 534                         unsigned long hva_start, hva_end;
 535
 536                         hva_start = max(range->start, slot->userspace_addr);
 537                         hva_end = min(range->end, slot->userspace_addr +
 538                                                   (slot->npages << PAGE_SHIFT));
 539                         if (hva_start >= hva_end)
 540                                 continue;
 541
 542                         /*
 543                          * To optimize for the likely case where the address
 544                          * range is covered by zero or one memslots, don't
 545                          * bother making these conditional (to avoid writes on
 546                          * the second or later invocation of the handler).
 547                          */
 548                         gfn_range.pte = range->pte;
 549                         gfn_range.may_block = range->may_block;
 550
 551                         /*
 552                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
 553                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 554                          */
 555                         gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 556                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 557                         gfn_range.slot = slot;
 558
 559                         if (!locked) {
 560                                 locked = true;
 561                                 KVM_MMU_LOCK(kvm);
 562                                 if (!IS_KVM_NULL_FN(range->on_lock))
 563                                         range->on_lock(kvm, range->start, range->end);
 564                                 if (IS_KVM_NULL_FN(range->handler))
 565                                         break;
 566                         }
 567                         ret |= range->handler(kvm, &gfn_range);
 568                 }
 569         }
 570
 571         if (range->flush_on_ret && ret)
 572                 kvm_flush_remote_tlbs(kvm);
 573
 574         if (locked)
 575                 KVM_MMU_UNLOCK(kvm);
 576
 577         srcu_read_unlock(&kvm->srcu, idx);
 578
 579         /* The notifiers are averse to booleans. :-( */
 580         return (int)ret;
 581 }
 582
 583 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 584                                                 unsigned long start,
 585                                                 unsigned long end,
 586                                                 pte_t pte,
 587                                                 hva_handler_t handler)
 588 {
 589         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 590         const struct kvm_hva_range range = {
 591                 .start          = start,
 592                 .end            = end,
 593                 .pte            = pte,
 594                 .handler        = handler,
 595                 .on_lock        = (void *)kvm_null_fn,
 596                 .flush_on_ret   = true,
 597                 .may_block      = false,
 598         };
 599
 600         return __kvm_handle_hva_range(kvm, &range);
 601 }
 602
 603 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 604                                                          unsigned long start,
 605                                                          unsigned long end,
 606                                                          hva_handler_t handler)
 607 {
 608         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 609         const struct kvm_hva_range range = {
 610                 .start          = start,
 611                 .end            = end,
 612                 .pte            = __pte(0),
 613                 .handler        = handler,
 614                 .on_lock        = (void *)kvm_null_fn,
 615                 .flush_on_ret   = false,
 616                 .may_block      = false,
 617         };
 618
 619         return __kvm_handle_hva_range(kvm, &range);
 620 }
 621 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 622                                         struct mm_struct *mm,
 623                                         unsigned long address,
 624                                         pte_t pte)
 625 {
 626         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 627
 628         trace_kvm_set_spte_hva(address);
 629
 630         /*
 631          * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 632          * If mmu_notifier_count is zero, then no in-progress invalidations,
 633          * including this one, found a relevant memslot at start(); rechecking
 634          * memslots here is unnecessary.  Note, a false positive (count elevated
 635          * by a different invalidation) is sub-optimal but functionally ok.
 636          */
 637         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 638         if (!READ_ONCE(kvm->mmu_notifier_count))
 639                 return;
 640
 641         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 642 }
 643
 644 void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 645                                    unsigned long end)
 646 {
 647         /*
 648          * The count increase must become visible at unlock time as no
 649          * spte can be established without taking the mmu_lock and
 650          * count is also read inside the mmu_lock critical section.
 651          */
 652         kvm->mmu_notifier_count++;
 653         if (likely(kvm->mmu_notifier_count == 1)) {
 654                 kvm->mmu_notifier_range_start = start;
 655                 kvm->mmu_notifier_range_end = end;
 656         } else {
 657                 /*
 658                  * Fully tracking multiple concurrent ranges has dimishing
 659                  * returns. Keep things simple and just find the minimal range
 660                  * which includes the current and new ranges. As there won't be
 661                  * enough information to subtract a range after its invalidate
 662                  * completes, any ranges invalidated concurrently will
 663                  * accumulate and persist until all outstanding invalidates
 664                  * complete.
 665                  */
 666                 kvm->mmu_notifier_range_start =
 667                         min(kvm->mmu_notifier_range_start, start);
 668                 kvm->mmu_notifier_range_end =
 669                         max(kvm->mmu_notifier_range_end, end);
 670         }
 671 }
 672
 673 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 674                                         const struct mmu_notifier_range *range)
 675 {
 676         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 677         const struct kvm_hva_range hva_range = {
 678                 .start          = range->start,
 679                 .end            = range->end,
 680                 .pte            = __pte(0),
 681                 .handler        = kvm_unmap_gfn_range,
 682                 .on_lock        = kvm_inc_notifier_count,
 683                 .flush_on_ret   = true,
 684                 .may_block      = mmu_notifier_range_blockable(range),
 685         };
 686
 687         trace_kvm_unmap_hva_range(range->start, range->end);
 688
 689         /*
 690          * Prevent memslot modification between range_start() and range_end()
 691          * so that conditionally locking provides the same result in both
 692          * functions.  Without that guarantee, the mmu_notifier_count
 693          * adjustments will be imbalanced.
 694          *
 695          * Pairs with the decrement in range_end().
 696          */
 697         spin_lock(&kvm->mn_invalidate_lock);
 698         kvm->mn_active_invalidate_count++;
 699         spin_unlock(&kvm->mn_invalidate_lock);
 700
 701         __kvm_handle_hva_range(kvm, &hva_range);
 702
 703         return 0;
 704 }
 705
 706 void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 707                                    unsigned long end)
 708 {
 709         /*
 710          * This sequence increase will notify the kvm page fault that
 711          * the page that is going to be mapped in the spte could have
 712          * been freed.
 713          */
 714         kvm->mmu_notifier_seq++;
 715         smp_wmb();
 716         /*
 717          * The above sequence increase must be visible before the
 718          * below count decrease, which is ensured by the smp_wmb above
 719          * in conjunction with the smp_rmb in mmu_notifier_retry().
 720          */
 721         kvm->mmu_notifier_count--;
 722 }
 723
 724 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 725                                         const struct mmu_notifier_range *range)
 726 {
 727         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 728         const struct kvm_hva_range hva_range = {
 729                 .start          = range->start,
 730                 .end            = range->end,
 731                 .pte            = __pte(0),
 732                 .handler        = (void *)kvm_null_fn,
 733                 .on_lock        = kvm_dec_notifier_count,
 734                 .flush_on_ret   = false,
 735                 .may_block      = mmu_notifier_range_blockable(range),
 736         };
 737         bool wake;
 738
 739         __kvm_handle_hva_range(kvm, &hva_range);
 740
 741         /* Pairs with the increment in range_start(). */
 742         spin_lock(&kvm->mn_invalidate_lock);
 743         wake = (--kvm->mn_active_invalidate_count == 0);
 744         spin_unlock(&kvm->mn_invalidate_lock);
 745
 746         /*
 747          * There can only be one waiter, since the wait happens under
 748          * slots_lock.
 749          */
 750         if (wake)
 751                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 752
 753         BUG_ON(kvm->mmu_notifier_count < 0);
 754 }
 755
 756 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 757                                               struct mm_struct *mm,
 758                                               unsigned long start,
 759                                               unsigned long end)
 760 {
 761         trace_kvm_age_hva(start, end);
 762
 763         return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 764 }
 765
 766 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 767                                         struct mm_struct *mm,
 768                                         unsigned long start,
 769                                         unsigned long end)
 770 {
 771         trace_kvm_age_hva(start, end);
 772
 773         /*
 774          * Even though we do not flush TLB, this will still adversely
 775          * affect performance on pre-Haswell Intel EPT, where there is
 776          * no EPT Access Bit to clear so that we have to tear down EPT
 777          * tables instead. If we find this unacceptable, we can always
 778          * add a parameter to kvm_age_hva so that it effectively doesn't
 779          * do anything on clear_young.
 780          *
 781          * Also note that currently we never issue secondary TLB flushes
 782          * from clear_young, leaving this job up to the regular system
 783          * cadence. If we find this inaccurate, we might come up with a
 784          * more sophisticated heuristic later.
 785          */
 786         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 787 }
 788
 789 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 790                                        struct mm_struct *mm,
 791                                        unsigned long address)
 792 {
 793         trace_kvm_test_age_hva(address);
 794
 795         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 796                                              kvm_test_age_gfn);
 797 }
 798
 799 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 800                                      struct mm_struct *mm)
 801 {
 802         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 803         int idx;
 804
 805         idx = srcu_read_lock(&kvm->srcu);
 806         kvm_arch_flush_shadow_all(kvm);
 807         srcu_read_unlock(&kvm->srcu, idx);
 808 }
 809
 810 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 811         .invalidate_range       = kvm_mmu_notifier_invalidate_range,
 812         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 813         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 814         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 815         .clear_young            = kvm_mmu_notifier_clear_young,
 816         .test_young             = kvm_mmu_notifier_test_young,
 817         .change_pte             = kvm_mmu_notifier_change_pte,
 818         .release                = kvm_mmu_notifier_release,
 819 };
 820
 821 static int kvm_init_mmu_notifier(struct kvm *kvm)
 822 {
 823         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 824         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 825 }
 826
 827 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 828
 829 static int kvm_init_mmu_notifier(struct kvm *kvm)
 830 {
 831         return 0;
 832 }
 833
 834 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 835
 836 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 837 static int kvm_pm_notifier_call(struct notifier_block *bl,
 838                                 unsigned long state,
 839                                 void *unused)
 840 {
 841         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 842
 843         return kvm_arch_pm_notifier(kvm, state);
 844 }
 845
 846 static void kvm_init_pm_notifier(struct kvm *kvm)
 847 {
 848         kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 849         /* Suspend KVM before we suspend ftrace, RCU, etc. */
 850         kvm->pm_notifier.priority = INT_MAX;
 851         register_pm_notifier(&kvm->pm_notifier);
 852 }
 853
 854 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 855 {
 856         unregister_pm_notifier(&kvm->pm_notifier);
 857 }
 858 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
 859 static void kvm_init_pm_notifier(struct kvm *kvm)
 860 {
 861 }
 862
 863 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 864 {
 865 }
 866 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 867
 868 static struct kvm_memslots *kvm_alloc_memslots(void)
 869 {
 870         int i;
 871         struct kvm_memslots *slots;
 872
 873         slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
 874         if (!slots)
 875                 return NULL;
 876
 877         for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 878                 slots->id_to_index[i] = -1;
 879
 880         return slots;
 881 }
 882
 883 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 884 {
 885         if (!memslot->dirty_bitmap)
 886                 return;
 887
 888         kvfree(memslot->dirty_bitmap);
 889         memslot->dirty_bitmap = NULL;
 890 }
 891
 892 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 893 {
 894         kvm_destroy_dirty_bitmap(slot);
 895
 896         kvm_arch_free_memslot(kvm, slot);
 897
 898         slot->flags = 0;
 899         slot->npages = 0;
 900 }
 901
 902 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 903 {
 904         struct kvm_memory_slot *memslot;
 905
 906         if (!slots)
 907                 return;
 908
 909         kvm_for_each_memslot(memslot, slots)
 910                 kvm_free_memslot(kvm, memslot);
 911
 912         kvfree(slots);
 913 }
 914
 915 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
 916 {
 917         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
 918         case KVM_STATS_TYPE_INSTANT:
 919                 return 0444;
 920         case KVM_STATS_TYPE_CUMULATIVE:
 921         case KVM_STATS_TYPE_PEAK:
 922         default:
 923                 return 0644;
 924         }
 925 }
 926
 927
 928 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 929 {
 930         int i;
 931         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 932                                       kvm_vcpu_stats_header.num_desc;
 933
 934         if (!kvm->debugfs_dentry)
 935                 return;
 936
 937         debugfs_remove_recursive(kvm->debugfs_dentry);
 938
 939         if (kvm->debugfs_stat_data) {
 940                 for (i = 0; i < kvm_debugfs_num_entries; i++)
 941                         kfree(kvm->debugfs_stat_data[i]);
 942                 kfree(kvm->debugfs_stat_data);
 943         }
 944 }
 945
 946 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 947 {
 948         static DEFINE_MUTEX(kvm_debugfs_lock);
 949         struct dentry *dent;
 950         char dir_name[ITOA_MAX_LEN * 2];
 951         struct kvm_stat_data *stat_data;
 952         const struct _kvm_stats_desc *pdesc;
 953         int i, ret;
 954         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 955                                       kvm_vcpu_stats_header.num_desc;
 956
 957         if (!debugfs_initialized())
 958                 return 0;
 959
 960         snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
 961         mutex_lock(&kvm_debugfs_lock);
 962         dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
 963         if (dent) {
 964                 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
 965                 dput(dent);
 966                 mutex_unlock(&kvm_debugfs_lock);
 967                 return 0;
 968         }
 969         dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
 970         mutex_unlock(&kvm_debugfs_lock);
 971         if (IS_ERR(dent))
 972                 return 0;
 973
 974         kvm->debugfs_dentry = dent;
 975         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
 976                                          sizeof(*kvm->debugfs_stat_data),
 977                                          GFP_KERNEL_ACCOUNT);
 978         if (!kvm->debugfs_stat_data)
 979                 return -ENOMEM;
 980
 981         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
 982                 pdesc = &kvm_vm_stats_desc[i];
 983                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 984                 if (!stat_data)
 985                         return -ENOMEM;
 986
 987                 stat_data->kvm = kvm;
 988                 stat_data->desc = pdesc;
 989                 stat_data->kind = KVM_STAT_VM;
 990                 kvm->debugfs_stat_data[i] = stat_data;
 991                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
 992                                     kvm->debugfs_dentry, stat_data,
 993                                     &stat_fops_per_vm);
 994         }
 995
 996         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
 997                 pdesc = &kvm_vcpu_stats_desc[i];
 998                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 999                 if (!stat_data)
1000                         return -ENOMEM;
1001
1002                 stat_data->kvm = kvm;
1003                 stat_data->desc = pdesc;
1004                 stat_data->kind = KVM_STAT_VCPU;
1005                 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1006                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1007                                     kvm->debugfs_dentry, stat_data,
1008                                     &stat_fops_per_vm);
1009         }
1010
1011         ret = kvm_arch_create_vm_debugfs(kvm);
1012         if (ret) {
1013                 kvm_destroy_vm_debugfs(kvm);
1014                 return i;
1015         }
1016
1017         return 0;
1018 }
1019
1020 /*
1021  * Called after the VM is otherwise initialized, but just before adding it to
1022  * the vm_list.
1023  */
1024 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1025 {
1026         return 0;
1027 }
1028
1029 /*
1030  * Called just after removing the VM from the vm_list, but before doing any
1031  * other destruction.
1032  */
1033 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1034 {
1035 }
1036
1037 /*
1038  * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
1039  * be setup already, so we can create arch-specific debugfs entries under it.
1040  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1041  * a per-arch destroy interface is not needed.
1042  */
1043 int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1044 {
1045         return 0;
1046 }
1047
1048 static struct kvm *kvm_create_vm(unsigned long type)
1049 {
1050         struct kvm *kvm = kvm_arch_alloc_vm();
1051         int r = -ENOMEM;
1052         int i;
1053
1054         if (!kvm)
1055                 return ERR_PTR(-ENOMEM);
1056
1057         KVM_MMU_LOCK_INIT(kvm);
1058         mmgrab(current->mm);
1059         kvm->mm = current->mm;
1060         kvm_eventfd_init(kvm);
1061         mutex_init(&kvm->lock);
1062         mutex_init(&kvm->irq_lock);
1063         mutex_init(&kvm->slots_lock);
1064         mutex_init(&kvm->slots_arch_lock);
1065         spin_lock_init(&kvm->mn_invalidate_lock);
1066         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1067         xa_init(&kvm->vcpu_array);
1068
1069         INIT_LIST_HEAD(&kvm->devices);
1070
1071         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1072
1073         if (init_srcu_struct(&kvm->srcu))
1074                 goto out_err_no_srcu;
1075         if (init_srcu_struct(&kvm->irq_srcu))
1076                 goto out_err_no_irq_srcu;
1077
1078         refcount_set(&kvm->users_count, 1);
1079         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1080                 struct kvm_memslots *slots = kvm_alloc_memslots();
1081
1082                 if (!slots)
1083                         goto out_err_no_arch_destroy_vm;
1084                 /* Generations must be different for each address space. */
1085                 slots->generation = i;
1086                 rcu_assign_pointer(kvm->memslots[i], slots);
1087         }
1088
1089         for (i = 0; i < KVM_NR_BUSES; i++) {
1090                 rcu_assign_pointer(kvm->buses[i],
1091                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1092                 if (!kvm->buses[i])
1093                         goto out_err_no_arch_destroy_vm;
1094         }
1095
1096         kvm->max_halt_poll_ns = halt_poll_ns;
1097
1098         r = kvm_arch_init_vm(kvm, type);
1099         if (r)
1100                 goto out_err_no_arch_destroy_vm;
1101
1102         r = hardware_enable_all();
1103         if (r)
1104                 goto out_err_no_disable;
1105
1106 #ifdef CONFIG_HAVE_KVM_IRQFD
1107         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1108 #endif
1109
1110         r = kvm_init_mmu_notifier(kvm);
1111         if (r)
1112                 goto out_err_no_mmu_notifier;
1113
1114         r = kvm_arch_post_init_vm(kvm);
1115         if (r)
1116                 goto out_err;
1117
1118         mutex_lock(&kvm_lock);
1119         list_add(&kvm->vm_list, &vm_list);
1120         mutex_unlock(&kvm_lock);
1121
1122         preempt_notifier_inc();
1123         kvm_init_pm_notifier(kvm);
1124
1125         return kvm;
1126
1127 out_err:
1128 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1129         if (kvm->mmu_notifier.ops)
1130                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1131 #endif
1132 out_err_no_mmu_notifier:
1133         hardware_disable_all();
1134 out_err_no_disable:
1135         kvm_arch_destroy_vm(kvm);
1136 out_err_no_arch_destroy_vm:
1137         WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1138         for (i = 0; i < KVM_NR_BUSES; i++)
1139                 kfree(kvm_get_bus(kvm, i));
1140         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1141                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1142         cleanup_srcu_struct(&kvm->irq_srcu);
1143 out_err_no_irq_srcu:
1144         cleanup_srcu_struct(&kvm->srcu);
1145 out_err_no_srcu:
1146         kvm_arch_free_vm(kvm);
1147         mmdrop(current->mm);
1148         return ERR_PTR(r);
1149 }
1150
1151 static void kvm_destroy_devices(struct kvm *kvm)
1152 {
1153         struct kvm_device *dev, *tmp;
1154
1155         /*
1156          * We do not need to take the kvm->lock here, because nobody else
1157          * has a reference to the struct kvm at this point and therefore
1158          * cannot access the devices list anyhow.
1159          */
1160         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1161                 list_del(&dev->vm_node);
1162                 dev->ops->destroy(dev);
1163         }
1164 }
1165
1166 static void kvm_destroy_vm(struct kvm *kvm)
1167 {
1168         int i;
1169         struct mm_struct *mm = kvm->mm;
1170
1171         kvm_destroy_pm_notifier(kvm);
1172         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1173         kvm_destroy_vm_debugfs(kvm);
1174         kvm_arch_sync_events(kvm);
1175         mutex_lock(&kvm_lock);
1176         list_del(&kvm->vm_list);
1177         mutex_unlock(&kvm_lock);
1178         kvm_arch_pre_destroy_vm(kvm);
1179
1180         kvm_free_irq_routing(kvm);
1181         for (i = 0; i < KVM_NR_BUSES; i++) {
1182                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1183
1184                 if (bus)
1185                         kvm_io_bus_destroy(bus);
1186                 kvm->buses[i] = NULL;
1187         }
1188         kvm_coalesced_mmio_free(kvm);
1189 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1190         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1191         /*
1192          * At this point, pending calls to invalidate_range_start()
1193          * have completed but no more MMU notifiers will run, so
1194          * mn_active_invalidate_count may remain unbalanced.
1195          * No threads can be waiting in install_new_memslots as the
1196          * last reference on KVM has been dropped, but freeing
1197          * memslots would deadlock without this manual intervention.
1198          */
1199         WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1200         kvm->mn_active_invalidate_count = 0;
1201 #else
1202         kvm_arch_flush_shadow_all(kvm);
1203 #endif
1204         kvm_arch_destroy_vm(kvm);
1205         kvm_destroy_devices(kvm);
1206         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1207                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1208         cleanup_srcu_struct(&kvm->irq_srcu);
1209         cleanup_srcu_struct(&kvm->srcu);
1210         kvm_arch_free_vm(kvm);
1211         preempt_notifier_dec();
1212         hardware_disable_all();
1213         mmdrop(mm);
1214 }
1215
1216 void kvm_get_kvm(struct kvm *kvm)
1217 {
1218         refcount_inc(&kvm->users_count);
1219 }
1220 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1221
1222 /*
1223  * Make sure the vm is not during destruction, which is a safe version of
1224  * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
1225  */
1226 bool kvm_get_kvm_safe(struct kvm *kvm)
1227 {
1228         return refcount_inc_not_zero(&kvm->users_count);
1229 }
1230 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1231
1232 void kvm_put_kvm(struct kvm *kvm)
1233 {
1234         if (refcount_dec_and_test(&kvm->users_count))
1235                 kvm_destroy_vm(kvm);
1236 }
1237 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1238
1239 /*
1240  * Used to put a reference that was taken on behalf of an object associated
1241  * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1242  * of the new file descriptor fails and the reference cannot be transferred to
1243  * its final owner.  In such cases, the caller is still actively using @kvm and
1244  * will fail miserably if the refcount unexpectedly hits zero.
1245  */
1246 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1247 {
1248         WARN_ON(refcount_dec_and_test(&kvm->users_count));
1249 }
1250 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1251
1252 static int kvm_vm_release(struct inode *inode, struct file *filp)
1253 {
1254         struct kvm *kvm = filp->private_data;
1255
1256         kvm_irqfd_release(kvm);
1257
1258         kvm_put_kvm(kvm);
1259         return 0;
1260 }
1261
1262 /*
1263  * Allocation size is twice as large as the actual dirty bitmap size.
1264  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1265  */
1266 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1267 {
1268         unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1269
1270         memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1271         if (!memslot->dirty_bitmap)
1272                 return -ENOMEM;
1273
1274         return 0;
1275 }
1276
1277 /*
1278  * Delete a memslot by decrementing the number of used slots and shifting all
1279  * other entries in the array forward one spot.
1280  */
1281 static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1282                                       struct kvm_memory_slot *memslot)
1283 {
1284         struct kvm_memory_slot *mslots = slots->memslots;
1285         int i;
1286
1287         if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1288                 return;
1289
1290         slots->used_slots--;
1291
1292         if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1293                 atomic_set(&slots->last_used_slot, 0);
1294
1295         for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1296                 mslots[i] = mslots[i + 1];
1297                 slots->id_to_index[mslots[i].id] = i;
1298         }
1299         mslots[i] = *memslot;
1300         slots->id_to_index[memslot->id] = -1;
1301 }
1302
1303 /*
1304  * "Insert" a new memslot by incrementing the number of used slots.  Returns
1305  * the new slot's initial index into the memslots array.
1306  */
1307 static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1308 {
1309         return slots->used_slots++;
1310 }
1311
1312 /*
1313  * Move a changed memslot backwards in the array by shifting existing slots
1314  * with a higher GFN toward the front of the array.  Note, the changed memslot
1315  * itself is not preserved in the array, i.e. not swapped at this time, only
1316  * its new index into the array is tracked.  Returns the changed memslot's
1317  * current index into the memslots array.
1318  */
1319 static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1320                                             struct kvm_memory_slot *memslot)
1321 {
1322         struct kvm_memory_slot *mslots = slots->memslots;
1323         int i;
1324
1325         if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1326             WARN_ON_ONCE(!slots->used_slots))
1327                 return -1;
1328
1329         /*
1330          * Move the target memslot backward in the array by shifting existing
1331          * memslots with a higher GFN (than the target memslot) towards the
1332          * front of the array.
1333          */
1334         for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1335                 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1336                         break;
1337
1338                 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1339
1340                 /* Shift the next memslot forward one and update its index. */
1341                 mslots[i] = mslots[i + 1];
1342                 slots->id_to_index[mslots[i].id] = i;
1343         }
1344         return i;
1345 }
1346
1347 /*
1348  * Move a changed memslot forwards in the array by shifting existing slots with
1349  * a lower GFN toward the back of the array.  Note, the changed memslot itself
1350  * is not preserved in the array, i.e. not swapped at this time, only its new
1351  * index into the array is tracked.  Returns the changed memslot's final index
1352  * into the memslots array.
1353  */
1354 static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1355                                            struct kvm_memory_slot *memslot,
1356                                            int start)
1357 {
1358         struct kvm_memory_slot *mslots = slots->memslots;
1359         int i;
1360
1361         for (i = start; i > 0; i--) {
1362                 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1363                         break;
1364
1365                 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1366
1367                 /* Shift the next memslot back one and update its index. */
1368                 mslots[i] = mslots[i - 1];
1369                 slots->id_to_index[mslots[i].id] = i;
1370         }
1371         return i;
1372 }
1373
1374 /*
1375  * Re-sort memslots based on their GFN to account for an added, deleted, or
1376  * moved memslot.  Sorting memslots by GFN allows using a binary search during
1377  * memslot lookup.
1378  *
1379  * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
1380  * at memslots[0] has the highest GFN.
1381  *
1382  * The sorting algorithm takes advantage of having initially sorted memslots
1383  * and knowing the position of the changed memslot.  Sorting is also optimized
1384  * by not swapping the updated memslot and instead only shifting other memslots
1385  * and tracking the new index for the update memslot.  Only once its final
1386  * index is known is the updated memslot copied into its position in the array.
1387  *
1388  *  - When deleting a memslot, the deleted memslot simply needs to be moved to
1389  *    the end of the array.
1390  *
1391  *  - When creating a memslot, the algorithm "inserts" the new memslot at the
1392  *    end of the array and then it forward to its correct location.
1393  *
1394  *  - When moving a memslot, the algorithm first moves the updated memslot
1395  *    backward to handle the scenario where the memslot's GFN was changed to a
1396  *    lower value.  update_memslots() then falls through and runs the same flow
1397  *    as creating a memslot to move the memslot forward to handle the scenario
1398  *    where its GFN was changed to a higher value.
1399  *
1400  * Note, slots are sorted from highest->lowest instead of lowest->highest for
1401  * historical reasons.  Originally, invalid memslots where denoted by having
1402  * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1403  * to the end of the array.  The current algorithm uses dedicated logic to
1404  * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1405  *
1406  * The other historical motiviation for highest->lowest was to improve the
1407  * performance of memslot lookup.  KVM originally used a linear search starting
1408  * at memslots[0].  On x86, the largest memslot usually has one of the highest,
1409  * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1410  * single memslot above the 4gb boundary.  As the largest memslot is also the
1411  * most likely to be referenced, sorting it to the front of the array was
1412  * advantageous.  The current binary search starts from the middle of the array
1413  * and uses an LRU pointer to improve performance for all memslots and GFNs.
1414  */
1415 static void update_memslots(struct kvm_memslots *slots,
1416                             struct kvm_memory_slot *memslot,
1417                             enum kvm_mr_change change)
1418 {
1419         int i;
1420
1421         if (change == KVM_MR_DELETE) {
1422                 kvm_memslot_delete(slots, memslot);
1423         } else {
1424                 if (change == KVM_MR_CREATE)
1425                         i = kvm_memslot_insert_back(slots);
1426                 else
1427                         i = kvm_memslot_move_backward(slots, memslot);
1428                 i = kvm_memslot_move_forward(slots, memslot, i);
1429
1430                 /*
1431                  * Copy the memslot to its new position in memslots and update
1432                  * its index accordingly.
1433                  */
1434                 slots->memslots[i] = *memslot;
1435                 slots->id_to_index[memslot->id] = i;
1436         }
1437 }
1438
1439 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1440 {
1441         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1442
1443 #ifdef __KVM_HAVE_READONLY_MEM
1444         valid_flags |= KVM_MEM_READONLY;
1445 #endif
1446
1447         if (mem->flags & ~valid_flags)
1448                 return -EINVAL;
1449
1450         return 0;
1451 }
1452
1453 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1454                 int as_id, struct kvm_memslots *slots)
1455 {
1456         struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1457         u64 gen = old_memslots->generation;
1458
1459         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1460         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1461
1462         /*
1463          * Do not store the new memslots while there are invalidations in
1464          * progress, otherwise the locking in invalidate_range_start and
1465          * invalidate_range_end will be unbalanced.
1466          */
1467         spin_lock(&kvm->mn_invalidate_lock);
1468         prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1469         while (kvm->mn_active_invalidate_count) {
1470                 set_current_state(TASK_UNINTERRUPTIBLE);
1471                 spin_unlock(&kvm->mn_invalidate_lock);
1472                 schedule();
1473                 spin_lock(&kvm->mn_invalidate_lock);
1474         }
1475         finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1476         rcu_assign_pointer(kvm->memslots[as_id], slots);
1477         spin_unlock(&kvm->mn_invalidate_lock);
1478
1479         /*
1480          * Acquired in kvm_set_memslot. Must be released before synchronize
1481          * SRCU below in order to avoid deadlock with another thread
1482          * acquiring the slots_arch_lock in an srcu critical section.
1483          */
1484         mutex_unlock(&kvm->slots_arch_lock);
1485
1486         synchronize_srcu_expedited(&kvm->srcu);
1487
1488         /*
1489          * Increment the new memslot generation a second time, dropping the
1490          * update in-progress flag and incrementing the generation based on
1491          * the number of address spaces.  This provides a unique and easily
1492          * identifiable generation number while the memslots are in flux.
1493          */
1494         gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1495
1496         /*
1497          * Generations must be unique even across address spaces.  We do not need
1498          * a global counter for that, instead the generation space is evenly split
1499          * across address spaces.  For example, with two address spaces, address
1500          * space 0 will use generations 0, 2, 4, ... while address space 1 will
1501          * use generations 1, 3, 5, ...
1502          */
1503         gen += KVM_ADDRESS_SPACE_NUM;
1504
1505         kvm_arch_memslots_updated(kvm, gen);
1506
1507         slots->generation = gen;
1508
1509         return old_memslots;
1510 }
1511
1512 static size_t kvm_memslots_size(int slots)
1513 {
1514         return sizeof(struct kvm_memslots) +
1515                (sizeof(struct kvm_memory_slot) * slots);
1516 }
1517
1518 /*
1519  * Note, at a minimum, the current number of used slots must be allocated, even
1520  * when deleting a memslot, as we need a complete duplicate of the memslots for
1521  * use when invalidating a memslot prior to deleting/moving the memslot.
1522  */
1523 static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1524                                              enum kvm_mr_change change)
1525 {
1526         struct kvm_memslots *slots;
1527         size_t new_size;
1528
1529         if (change == KVM_MR_CREATE)
1530                 new_size = kvm_memslots_size(old->used_slots + 1);
1531         else
1532                 new_size = kvm_memslots_size(old->used_slots);
1533
1534         slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1535         if (likely(slots))
1536                 memcpy(slots, old, kvm_memslots_size(old->used_slots));
1537
1538         return slots;
1539 }
1540
1541 static void kvm_copy_memslots_arch(struct kvm_memslots *to,
1542                                    struct kvm_memslots *from)
1543 {
1544         int i;
1545
1546         WARN_ON_ONCE(to->used_slots != from->used_slots);
1547
1548         for (i = 0; i < from->used_slots; i++)
1549                 to->memslots[i].arch = from->memslots[i].arch;
1550 }
1551
1552 static int kvm_prepare_memory_region(struct kvm *kvm,
1553                                      const struct kvm_memory_slot *old,
1554                                      struct kvm_memory_slot *new,
1555                                      enum kvm_mr_change change)
1556 {
1557         int r;
1558
1559         /*
1560          * If dirty logging is disabled, nullify the bitmap; the old bitmap
1561          * will be freed on "commit".  If logging is enabled in both old and
1562          * new, reuse the existing bitmap.  If logging is enabled only in the
1563          * new and KVM isn't using a ring buffer, allocate and initialize a
1564          * new bitmap.
1565          */
1566         if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1567                 new->dirty_bitmap = NULL;
1568         else if (old->dirty_bitmap)
1569                 new->dirty_bitmap = old->dirty_bitmap;
1570         else if (!kvm->dirty_ring_size) {
1571                 r = kvm_alloc_dirty_bitmap(new);
1572                 if (r)
1573                         return r;
1574
1575                 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1576                         bitmap_set(new->dirty_bitmap, 0, new->npages);
1577         }
1578
1579         r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1580
1581         /* Free the bitmap on failure if it was allocated above. */
1582         if (r && new->dirty_bitmap && !old->dirty_bitmap)
1583                 kvm_destroy_dirty_bitmap(new);
1584
1585         return r;
1586 }
1587
1588 static void kvm_commit_memory_region(struct kvm *kvm,
1589                                      struct kvm_memory_slot *old,
1590                                      const struct kvm_memory_slot *new,
1591                                      enum kvm_mr_change change)
1592 {
1593         /*
1594          * Update the total number of memslot pages before calling the arch
1595          * hook so that architectures can consume the result directly.
1596          */
1597         if (change == KVM_MR_DELETE)
1598                 kvm->nr_memslot_pages -= old->npages;
1599         else if (change == KVM_MR_CREATE)
1600                 kvm->nr_memslot_pages += new->npages;
1601
1602         kvm_arch_commit_memory_region(kvm, old, new, change);
1603
1604         /*
1605          * Free the old memslot's metadata.  On DELETE, free the whole thing,
1606          * otherwise free the dirty bitmap as needed (the below effectively
1607          * checks both the flags and whether a ring buffer is being used).
1608          */
1609         if (change == KVM_MR_DELETE)
1610                 kvm_free_memslot(kvm, old);
1611         else if (old->dirty_bitmap && !new->dirty_bitmap)
1612                 kvm_destroy_dirty_bitmap(old);
1613 }
1614
1615 static int kvm_set_memslot(struct kvm *kvm,
1616                            struct kvm_memory_slot *new,
1617                            enum kvm_mr_change change)
1618 {
1619         struct kvm_memory_slot *slot, old;
1620         struct kvm_memslots *slots;
1621         int r;
1622
1623         /*
1624          * Released in install_new_memslots.
1625          *
1626          * Must be held from before the current memslots are copied until
1627          * after the new memslots are installed with rcu_assign_pointer,
1628          * then released before the synchronize srcu in install_new_memslots.
1629          *
1630          * When modifying memslots outside of the slots_lock, must be held
1631          * before reading the pointer to the current memslots until after all
1632          * changes to those memslots are complete.
1633          *
1634          * These rules ensure that installing new memslots does not lose
1635          * changes made to the previous memslots.
1636          */
1637         mutex_lock(&kvm->slots_arch_lock);
1638
1639         slots = kvm_dup_memslots(__kvm_memslots(kvm, new->as_id), change);
1640         if (!slots) {
1641                 mutex_unlock(&kvm->slots_arch_lock);
1642                 return -ENOMEM;
1643         }
1644
1645         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1646                 /*
1647                  * Note, the INVALID flag needs to be in the appropriate entry
1648                  * in the freshly allocated memslots, not in @old or @new.
1649                  */
1650                 slot = id_to_memslot(slots, new->id);
1651                 slot->flags |= KVM_MEMSLOT_INVALID;
1652
1653                 /*
1654                  * We can re-use the old memslots, the only difference from the
1655                  * newly installed memslots is the invalid flag, which will get
1656                  * dropped by update_memslots anyway.  We'll also revert to the
1657                  * old memslots if preparing the new memory region fails.
1658                  */
1659                 slots = install_new_memslots(kvm, new->as_id, slots);
1660
1661                 /* From this point no new shadow pages pointing to a deleted,
1662                  * or moved, memslot will be created.
1663                  *
1664                  * validation of sp->gfn happens in:
1665                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1666                  *      - kvm_is_visible_gfn (mmu_check_root)
1667                  */
1668                 kvm_arch_flush_shadow_memslot(kvm, slot);
1669
1670                 /* Released in install_new_memslots. */
1671                 mutex_lock(&kvm->slots_arch_lock);
1672
1673                 /*
1674                  * The arch-specific fields of the now-active memslots could
1675                  * have been modified between releasing slots_arch_lock in
1676                  * install_new_memslots and re-acquiring slots_arch_lock above.
1677                  * Copy them to the inactive memslots.  Arch code is required
1678                  * to retrieve memslots *after* acquiring slots_arch_lock, thus
1679                  * the active memslots are guaranteed to be fresh.
1680                  */
1681                 kvm_copy_memslots_arch(slots, __kvm_memslots(kvm, new->as_id));
1682         }
1683
1684         /*
1685          * Make a full copy of the old memslot, the pointer will become stale
1686          * when the memslots are re-sorted by update_memslots(), and the old
1687          * memslot needs to be referenced after calling update_memslots(), e.g.
1688          * to free its resources and for arch specific behavior.  This needs to
1689          * happen *after* (re)acquiring slots_arch_lock.
1690          */
1691         slot = id_to_memslot(slots, new->id);
1692         if (slot) {
1693                 old = *slot;
1694         } else {
1695                 WARN_ON_ONCE(change != KVM_MR_CREATE);
1696                 memset(&old, 0, sizeof(old));
1697                 old.id = new->id;
1698                 old.as_id = new->as_id;
1699         }
1700
1701         r = kvm_prepare_memory_region(kvm, &old, new, change);
1702         if (r)
1703                 goto out_slots;
1704
1705         update_memslots(slots, new, change);
1706         slots = install_new_memslots(kvm, new->as_id, slots);
1707
1708         kvm_commit_memory_region(kvm, &old, new, change);
1709
1710         kvfree(slots);
1711         return 0;
1712
1713 out_slots:
1714         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1715                 slots = install_new_memslots(kvm, new->as_id, slots);
1716         else
1717                 mutex_unlock(&kvm->slots_arch_lock);
1718         kvfree(slots);
1719         return r;
1720 }
1721
1722 /*
1723  * Allocate some memory and give it an address in the guest physical address
1724  * space.
1725  *
1726  * Discontiguous memory is allowed, mostly for framebuffers.
1727  *
1728  * Must be called holding kvm->slots_lock for write.
1729  */
1730 int __kvm_set_memory_region(struct kvm *kvm,
1731                             const struct kvm_userspace_memory_region *mem)
1732 {
1733         struct kvm_memory_slot old, new;
1734         struct kvm_memory_slot *tmp;
1735         enum kvm_mr_change change;
1736         int as_id, id;
1737         int r;
1738
1739         r = check_memory_region_flags(mem);
1740         if (r)
1741                 return r;
1742
1743         as_id = mem->slot >> 16;
1744         id = (u16)mem->slot;
1745
1746         /* General sanity checks */
1747         if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1748             (mem->memory_size != (unsigned long)mem->memory_size))
1749                 return -EINVAL;
1750         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1751                 return -EINVAL;
1752         /* We can read the guest memory with __xxx_user() later on. */
1753         if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1754             (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1755              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1756                         mem->memory_size))
1757                 return -EINVAL;
1758         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1759                 return -EINVAL;
1760         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1761                 return -EINVAL;
1762
1763         /*
1764          * Make a full copy of the old memslot, the pointer will become stale
1765          * when the memslots are re-sorted by update_memslots(), and the old
1766          * memslot needs to be referenced after calling update_memslots(), e.g.
1767          * to free its resources and for arch specific behavior.
1768          */
1769         tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1770         if (tmp) {
1771                 old = *tmp;
1772                 tmp = NULL;
1773         } else {
1774                 memset(&old, 0, sizeof(old));
1775                 old.id = id;
1776         }
1777
1778         if (!mem->memory_size) {
1779                 if (!old.npages)
1780                         return -EINVAL;
1781
1782                 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old.npages))
1783                         return -EIO;
1784
1785                 memset(&new, 0, sizeof(new));
1786                 new.id = id;
1787                 new.as_id = as_id;
1788
1789                 return kvm_set_memslot(kvm, &new, KVM_MR_DELETE);
1790         }
1791
1792         new.as_id = as_id;
1793         new.id = id;
1794         new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1795         new.npages = mem->memory_size >> PAGE_SHIFT;
1796         new.flags = mem->flags;
1797         new.userspace_addr = mem->userspace_addr;
1798
1799         if (new.npages > KVM_MEM_MAX_NR_PAGES)
1800                 return -EINVAL;
1801
1802         if (!old.npages) {
1803                 change = KVM_MR_CREATE;
1804
1805                 /*
1806                  * To simplify KVM internals, the total number of pages across
1807                  * all memslots must fit in an unsigned long.
1808                  */
1809                 if ((kvm->nr_memslot_pages + new.npages) < kvm->nr_memslot_pages)
1810                         return -EINVAL;
1811         } else { /* Modify an existing slot. */
1812                 if ((new.userspace_addr != old.userspace_addr) ||
1813                     (new.npages != old.npages) ||
1814                     ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1815                         return -EINVAL;
1816
1817                 if (new.base_gfn != old.base_gfn)
1818                         change = KVM_MR_MOVE;
1819                 else if (new.flags != old.flags)
1820                         change = KVM_MR_FLAGS_ONLY;
1821                 else /* Nothing to change. */
1822                         return 0;
1823         }
1824
1825         if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1826                 /* Check for overlaps */
1827                 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1828                         if (tmp->id == id)
1829                                 continue;
1830                         if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1831                               (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1832                                 return -EEXIST;
1833                 }
1834         }
1835
1836         return kvm_set_memslot(kvm, &new, change);
1837 }
1838 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1839
1840 int kvm_set_memory_region(struct kvm *kvm,
1841                           const struct kvm_userspace_memory_region *mem)
1842 {
1843         int r;
1844
1845         mutex_lock(&kvm->slots_lock);
1846         r = __kvm_set_memory_region(kvm, mem);
1847         mutex_unlock(&kvm->slots_lock);
1848         return r;
1849 }
1850 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1851
1852 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1853                                           struct kvm_userspace_memory_region *mem)
1854 {
1855         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1856                 return -EINVAL;
1857
1858         return kvm_set_memory_region(kvm, mem);
1859 }
1860
1861 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1862 /**
1863  * kvm_get_dirty_log - get a snapshot of dirty pages
1864  * @kvm:        pointer to kvm instance
1865  * @log:        slot id and address to which we copy the log
1866  * @is_dirty:   set to '1' if any dirty pages were found
1867  * @memslot:    set to the associated memslot, always valid on success
1868  */
1869 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1870                       int *is_dirty, struct kvm_memory_slot **memslot)
1871 {
1872         struct kvm_memslots *slots;
1873         int i, as_id, id;
1874         unsigned long n;
1875         unsigned long any = 0;
1876
1877         /* Dirty ring tracking is exclusive to dirty log tracking */
1878         if (kvm->dirty_ring_size)
1879                 return -ENXIO;
1880
1881         *memslot = NULL;
1882         *is_dirty = 0;
1883
1884         as_id = log->slot >> 16;
1885         id = (u16)log->slot;
1886         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1887                 return -EINVAL;
1888
1889         slots = __kvm_memslots(kvm, as_id);
1890         *memslot = id_to_memslot(slots, id);
1891         if (!(*memslot) || !(*memslot)->dirty_bitmap)
1892                 return -ENOENT;
1893
1894         kvm_arch_sync_dirty_log(kvm, *memslot);
1895
1896         n = kvm_dirty_bitmap_bytes(*memslot);
1897
1898         for (i = 0; !any && i < n/sizeof(long); ++i)
1899                 any = (*memslot)->dirty_bitmap[i];
1900
1901         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1902                 return -EFAULT;
1903
1904         if (any)
1905                 *is_dirty = 1;
1906         return 0;
1907 }
1908 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1909
1910 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
1911 /**
1912  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1913  *      and reenable dirty page tracking for the corresponding pages.
1914  * @kvm:        pointer to kvm instance
1915  * @log:        slot id and address to which we copy the log
1916  *
1917  * We need to keep it in mind that VCPU threads can write to the bitmap
1918  * concurrently. So, to avoid losing track of dirty pages we keep the
1919  * following order:
1920  *
1921  *    1. Take a snapshot of the bit and clear it if needed.
1922  *    2. Write protect the corresponding page.
1923  *    3. Copy the snapshot to the userspace.
1924  *    4. Upon return caller flushes TLB's if needed.
1925  *
1926  * Between 2 and 4, the guest may write to the page using the remaining TLB
1927  * entry.  This is not a problem because the page is reported dirty using
1928  * the snapshot taken before and step 4 ensures that writes done after
1929  * exiting to userspace will be logged for the next call.
1930  *
1931  */
1932 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1933 {
1934         struct kvm_memslots *slots;
1935         struct kvm_memory_slot *memslot;
1936         int i, as_id, id;
1937         unsigned long n;
1938         unsigned long *dirty_bitmap;
1939         unsigned long *dirty_bitmap_buffer;
1940         bool flush;
1941
1942         /* Dirty ring tracking is exclusive to dirty log tracking */
1943         if (kvm->dirty_ring_size)
1944                 return -ENXIO;
1945
1946         as_id = log->slot >> 16;
1947         id = (u16)log->slot;
1948         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1949                 return -EINVAL;
1950
1951         slots = __kvm_memslots(kvm, as_id);
1952         memslot = id_to_memslot(slots, id);
1953         if (!memslot || !memslot->dirty_bitmap)
1954                 return -ENOENT;
1955
1956         dirty_bitmap = memslot->dirty_bitmap;
1957
1958         kvm_arch_sync_dirty_log(kvm, memslot);
1959
1960         n = kvm_dirty_bitmap_bytes(memslot);
1961         flush = false;
1962         if (kvm->manual_dirty_log_protect) {
1963                 /*
1964                  * Unlike kvm_get_dirty_log, we always return false in *flush,
1965                  * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
1966                  * is some code duplication between this function and
1967                  * kvm_get_dirty_log, but hopefully all architecture
1968                  * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
1969                  * can be eliminated.
1970                  */
1971                 dirty_bitmap_buffer = dirty_bitmap;
1972         } else {
1973                 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1974                 memset(dirty_bitmap_buffer, 0, n);
1975
1976                 KVM_MMU_LOCK(kvm);
1977                 for (i = 0; i < n / sizeof(long); i++) {
1978                         unsigned long mask;
1979                         gfn_t offset;
1980
1981                         if (!dirty_bitmap[i])
1982                                 continue;
1983
1984                         flush = true;
1985                         mask = xchg(&dirty_bitmap[i], 0);
1986                         dirty_bitmap_buffer[i] = mask;
1987
1988                         offset = i * BITS_PER_LONG;
1989                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1990                                                                 offset, mask);
1991                 }
1992                 KVM_MMU_UNLOCK(kvm);
1993         }
1994
1995         if (flush)
1996                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1997
1998         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1999                 return -EFAULT;
2000         return 0;
2001 }
2002
2003
2004 /**
2005  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2006  * @kvm: kvm instance
2007  * @log: slot id and address to which we copy the log
2008  *
2009  * Steps 1-4 below provide general overview of dirty page logging. See
2010  * kvm_get_dirty_log_protect() function description for additional details.
2011  *
2012  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2013  * always flush the TLB (step 4) even if previous step failed  and the dirty
2014  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
2015  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
2016  * writes will be marked dirty for next log read.
2017  *
2018  *   1. Take a snapshot of the bit and clear it if needed.
2019  *   2. Write protect the corresponding page.
2020  *   3. Copy the snapshot to the userspace.
2021  *   4. Flush TLB's if needed.
2022  */
2023 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2024                                       struct kvm_dirty_log *log)
2025 {
2026         int r;
2027
2028         mutex_lock(&kvm->slots_lock);
2029
2030         r = kvm_get_dirty_log_protect(kvm, log);
2031
2032         mutex_unlock(&kvm->slots_lock);
2033         return r;
2034 }
2035
2036 /**
2037  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2038  *      and reenable dirty page tracking for the corresponding pages.
2039  * @kvm:        pointer to kvm instance
2040  * @log:        slot id and address from which to fetch the bitmap of dirty pages
2041  */
2042 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2043                                        struct kvm_clear_dirty_log *log)
2044 {
2045         struct kvm_memslots *slots;
2046         struct kvm_memory_slot *memslot;
2047         int as_id, id;
2048         gfn_t offset;
2049         unsigned long i, n;
2050         unsigned long *dirty_bitmap;
2051         unsigned long *dirty_bitmap_buffer;
2052         bool flush;
2053
2054         /* Dirty ring tracking is exclusive to dirty log tracking */
2055         if (kvm->dirty_ring_size)
2056                 return -ENXIO;
2057
2058         as_id = log->slot >> 16;
2059         id = (u16)log->slot;
2060         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2061                 return -EINVAL;
2062
2063         if (log->first_page & 63)
2064                 return -EINVAL;
2065
2066         slots = __kvm_memslots(kvm, as_id);
2067         memslot = id_to_memslot(slots, id);
2068         if (!memslot || !memslot->dirty_bitmap)
2069                 return -ENOENT;
2070
2071         dirty_bitmap = memslot->dirty_bitmap;
2072
2073         n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2074
2075         if (log->first_page > memslot->npages ||
2076             log->num_pages > memslot->npages - log->first_page ||
2077             (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2078             return -EINVAL;
2079
2080         kvm_arch_sync_dirty_log(kvm, memslot);
2081
2082         flush = false;
2083         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2084         if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2085                 return -EFAULT;
2086
2087         KVM_MMU_LOCK(kvm);
2088         for (offset = log->first_page, i = offset / BITS_PER_LONG,
2089                  n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2090              i++, offset += BITS_PER_LONG) {
2091                 unsigned long mask = *dirty_bitmap_buffer++;
2092                 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2093                 if (!mask)
2094                         continue;
2095
2096                 mask &= atomic_long_fetch_andnot(mask, p);
2097
2098                 /*
2099                  * mask contains the bits that really have been cleared.  This
2100                  * never includes any bits beyond the length of the memslot (if
2101                  * the length is not aligned to 64 pages), therefore it is not
2102                  * a problem if userspace sets them in log->dirty_bitmap.
2103                 */
2104                 if (mask) {
2105                         flush = true;
2106                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2107                                                                 offset, mask);
2108                 }
2109         }
2110         KVM_MMU_UNLOCK(kvm);
2111
2112         if (flush)
2113                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2114
2115         return 0;
2116 }
2117
2118 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2119                                         struct kvm_clear_dirty_log *log)
2120 {
2121         int r;
2122
2123         mutex_lock(&kvm->slots_lock);
2124
2125         r = kvm_clear_dirty_log_protect(kvm, log);
2126
2127         mutex_unlock(&kvm->slots_lock);
2128         return r;
2129 }
2130 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2131
2132 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2133 {
2134         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2135 }
2136 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2137
2138 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2139 {
2140         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2141         struct kvm_memory_slot *slot;
2142         int slot_index;
2143
2144         slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2145         if (slot)
2146                 return slot;
2147
2148         /*
2149          * Fall back to searching all memslots. We purposely use
2150          * search_memslots() instead of __gfn_to_memslot() to avoid
2151          * thrashing the VM-wide last_used_index in kvm_memslots.
2152          */
2153         slot = search_memslots(slots, gfn, &slot_index);
2154         if (slot) {
2155                 vcpu->last_used_slot = slot_index;
2156                 return slot;
2157         }
2158
2159         return NULL;
2160 }
2161 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
2162
2163 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2164 {
2165         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2166
2167         return kvm_is_visible_memslot(memslot);
2168 }
2169 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2170
2171 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2172 {
2173         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2174
2175         return kvm_is_visible_memslot(memslot);
2176 }
2177 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2178
2179 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2180 {
2181         struct vm_area_struct *vma;
2182         unsigned long addr, size;
2183
2184         size = PAGE_SIZE;
2185
2186         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2187         if (kvm_is_error_hva(addr))
2188                 return PAGE_SIZE;
2189
2190         mmap_read_lock(current->mm);
2191         vma = find_vma(current->mm, addr);
2192         if (!vma)
2193                 goto out;
2194
2195         size = vma_kernel_pagesize(vma);
2196
2197 out:
2198         mmap_read_unlock(current->mm);
2199
2200         return size;
2201 }
2202
2203 static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2204 {
2205         return slot->flags & KVM_MEM_READONLY;
2206 }
2207
2208 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2209                                        gfn_t *nr_pages, bool write)
2210 {
2211         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2212                 return KVM_HVA_ERR_BAD;
2213
2214         if (memslot_is_readonly(slot) && write)
2215                 return KVM_HVA_ERR_RO_BAD;
2216
2217         if (nr_pages)
2218                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2219
2220         return __gfn_to_hva_memslot(slot, gfn);
2221 }
2222
2223 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2224                                      gfn_t *nr_pages)
2225 {
2226         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2227 }
2228
2229 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2230                                         gfn_t gfn)
2231 {
2232         return gfn_to_hva_many(slot, gfn, NULL);
2233 }
2234 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2235
2236 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2237 {
2238         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2239 }
2240 EXPORT_SYMBOL_GPL(gfn_to_hva);
2241
2242 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2243 {
2244         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2245 }
2246 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2247
2248 /*
2249  * Return the hva of a @gfn and the R/W attribute if possible.
2250  *
2251  * @slot: the kvm_memory_slot which contains @gfn
2252  * @gfn: the gfn to be translated
2253  * @writable: used to return the read/write attribute of the @slot if the hva
2254  * is valid and @writable is not NULL
2255  */
2256 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2257                                       gfn_t gfn, bool *writable)
2258 {
2259         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2260
2261         if (!kvm_is_error_hva(hva) && writable)
2262                 *writable = !memslot_is_readonly(slot);
2263
2264         return hva;
2265 }
2266
2267 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2268 {
2269         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2270
2271         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2272 }
2273
2274 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2275 {
2276         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2277
2278         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2279 }
2280
2281 static inline int check_user_page_hwpoison(unsigned long addr)
2282 {
2283         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2284
2285         rc = get_user_pages(addr, 1, flags, NULL, NULL);
2286         return rc == -EHWPOISON;
2287 }
2288
2289 /*
2290  * The fast path to get the writable pfn which will be stored in @pfn,
2291  * true indicates success, otherwise false is returned.  It's also the
2292  * only part that runs if we can in atomic context.
2293  */
2294 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2295                             bool *writable, kvm_pfn_t *pfn)
2296 {
2297         struct page *page[1];
2298
2299         /*
2300          * Fast pin a writable pfn only if it is a write fault request
2301          * or the caller allows to map a writable pfn for a read fault
2302          * request.
2303          */
2304         if (!(write_fault || writable))
2305                 return false;
2306
2307         if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2308                 *pfn = page_to_pfn(page[0]);
2309
2310                 if (writable)
2311                         *writable = true;
2312                 return true;
2313         }
2314
2315         return false;
2316 }
2317
2318 /*
2319  * The slow path to get the pfn of the specified host virtual address,
2320  * 1 indicates success, -errno is returned if error is detected.
2321  */
2322 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2323                            bool *writable, kvm_pfn_t *pfn)
2324 {
2325         unsigned int flags = FOLL_HWPOISON;
2326         struct page *page;
2327         int npages = 0;
2328
2329         might_sleep();
2330
2331         if (writable)
2332                 *writable = write_fault;
2333
2334         if (write_fault)
2335                 flags |= FOLL_WRITE;
2336         if (async)
2337                 flags |= FOLL_NOWAIT;
2338
2339         npages = get_user_pages_unlocked(addr, 1, &page, flags);
2340         if (npages != 1)
2341                 return npages;
2342
2343         /* map read fault as writable if possible */
2344         if (unlikely(!write_fault) && writable) {
2345                 struct page *wpage;
2346
2347                 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2348                         *writable = true;
2349                         put_page(page);
2350                         page = wpage;
2351                 }
2352         }
2353         *pfn = page_to_pfn(page);
2354         return npages;
2355 }
2356
2357 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2358 {
2359         if (unlikely(!(vma->vm_flags & VM_READ)))
2360                 return false;
2361
2362         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2363                 return false;
2364
2365         return true;
2366 }
2367
2368 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2369 {
2370         if (kvm_is_reserved_pfn(pfn))
2371                 return 1;
2372         return get_page_unless_zero(pfn_to_page(pfn));
2373 }
2374
2375 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2376                                unsigned long addr, bool *async,
2377                                bool write_fault, bool *writable,
2378                                kvm_pfn_t *p_pfn)
2379 {
2380         kvm_pfn_t pfn;
2381         pte_t *ptep;
2382         spinlock_t *ptl;
2383         int r;
2384
2385         r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2386         if (r) {
2387                 /*
2388                  * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2389                  * not call the fault handler, so do it here.
2390                  */
2391                 bool unlocked = false;
2392                 r = fixup_user_fault(current->mm, addr,
2393                                      (write_fault ? FAULT_FLAG_WRITE : 0),
2394                                      &unlocked);
2395                 if (unlocked)
2396                         return -EAGAIN;
2397                 if (r)
2398                         return r;
2399
2400                 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2401                 if (r)
2402                         return r;
2403         }
2404
2405         if (write_fault && !pte_write(*ptep)) {
2406                 pfn = KVM_PFN_ERR_RO_FAULT;
2407                 goto out;
2408         }
2409
2410         if (writable)
2411                 *writable = pte_write(*ptep);
2412         pfn = pte_pfn(*ptep);
2413
2414         /*
2415          * Get a reference here because callers of *hva_to_pfn* and
2416          * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2417          * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2418          * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2419          * simply do nothing for reserved pfns.
2420          *
2421          * Whoever called remap_pfn_range is also going to call e.g.
2422          * unmap_mapping_range before the underlying pages are freed,
2423          * causing a call to our MMU notifier.
2424          *
2425          * Certain IO or PFNMAP mappings can be backed with valid
2426          * struct pages, but be allocated without refcounting e.g.,
2427          * tail pages of non-compound higher order allocations, which
2428          * would then underflow the refcount when the caller does the
2429          * required put_page. Don't allow those pages here.
2430          */
2431         if (!kvm_try_get_pfn(pfn))
2432                 r = -EFAULT;
2433
2434 out:
2435         pte_unmap_unlock(ptep, ptl);
2436         *p_pfn = pfn;
2437
2438         return r;
2439 }
2440
2441 /*
2442  * Pin guest page in memory and return its pfn.
2443  * @addr: host virtual address which maps memory to the guest
2444  * @atomic: whether this function can sleep
2445  * @async: whether this function need to wait IO complete if the
2446  *         host page is not in the memory
2447  * @write_fault: whether we should get a writable host page
2448  * @writable: whether it allows to map a writable host page for !@write_fault
2449  *
2450  * The function will map a writable host page for these two cases:
2451  * 1): @write_fault = true
2452  * 2): @write_fault = false && @writable, @writable will tell the caller
2453  *     whether the mapping is writable.
2454  */
2455 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2456                         bool write_fault, bool *writable)
2457 {
2458         struct vm_area_struct *vma;
2459         kvm_pfn_t pfn = 0;
2460         int npages, r;
2461
2462         /* we can do it either atomically or asynchronously, not both */
2463         BUG_ON(atomic && async);
2464
2465         if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2466                 return pfn;
2467
2468         if (atomic)
2469                 return KVM_PFN_ERR_FAULT;
2470
2471         npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2472         if (npages == 1)
2473                 return pfn;
2474
2475         mmap_read_lock(current->mm);
2476         if (npages == -EHWPOISON ||
2477               (!async && check_user_page_hwpoison(addr))) {
2478                 pfn = KVM_PFN_ERR_HWPOISON;
2479                 goto exit;
2480         }
2481
2482 retry:
2483         vma = vma_lookup(current->mm, addr);
2484
2485         if (vma == NULL)
2486                 pfn = KVM_PFN_ERR_FAULT;
2487         else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2488                 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2489                 if (r == -EAGAIN)
2490                         goto retry;
2491                 if (r < 0)
2492                         pfn = KVM_PFN_ERR_FAULT;
2493         } else {
2494                 if (async && vma_is_valid(vma, write_fault))
2495                         *async = true;
2496                 pfn = KVM_PFN_ERR_FAULT;
2497         }
2498 exit:
2499         mmap_read_unlock(current->mm);
2500         return pfn;
2501 }
2502
2503 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2504                                bool atomic, bool *async, bool write_fault,
2505                                bool *writable, hva_t *hva)
2506 {
2507         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2508
2509         if (hva)
2510                 *hva = addr;
2511
2512         if (addr == KVM_HVA_ERR_RO_BAD) {
2513                 if (writable)
2514                         *writable = false;
2515                 return KVM_PFN_ERR_RO_FAULT;
2516         }
2517
2518         if (kvm_is_error_hva(addr)) {
2519                 if (writable)
2520                         *writable = false;
2521                 return KVM_PFN_NOSLOT;
2522         }
2523
2524         /* Do not map writable pfn in the readonly memslot. */
2525         if (writable && memslot_is_readonly(slot)) {
2526                 *writable = false;
2527                 writable = NULL;
2528         }
2529
2530         return hva_to_pfn(addr, atomic, async, write_fault,
2531                           writable);
2532 }
2533 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2534
2535 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2536                       bool *writable)
2537 {
2538         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2539                                     write_fault, writable, NULL);
2540 }
2541 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2542
2543 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2544 {
2545         return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2546 }
2547 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2548
2549 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2550 {
2551         return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2552 }
2553 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2554
2555 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2556 {
2557         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2558 }
2559 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2560
2561 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2562 {
2563         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2564 }
2565 EXPORT_SYMBOL_GPL(gfn_to_pfn);
2566
2567 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2568 {
2569         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2570 }
2571 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2572
2573 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2574                             struct page **pages, int nr_pages)
2575 {
2576         unsigned long addr;
2577         gfn_t entry = 0;
2578
2579         addr = gfn_to_hva_many(slot, gfn, &entry);
2580         if (kvm_is_error_hva(addr))
2581                 return -1;
2582
2583         if (entry < nr_pages)
2584                 return 0;
2585
2586         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2587 }
2588 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2589
2590 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2591 {
2592         if (is_error_noslot_pfn(pfn))
2593                 return KVM_ERR_PTR_BAD_PAGE;
2594
2595         if (kvm_is_reserved_pfn(pfn)) {
2596                 WARN_ON(1);
2597                 return KVM_ERR_PTR_BAD_PAGE;
2598         }
2599
2600         return pfn_to_page(pfn);
2601 }
2602
2603 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2604 {
2605         kvm_pfn_t pfn;
2606
2607         pfn = gfn_to_pfn(kvm, gfn);
2608
2609         return kvm_pfn_to_page(pfn);
2610 }
2611 EXPORT_SYMBOL_GPL(gfn_to_page);
2612
2613 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2614 {
2615         if (pfn == 0)
2616                 return;
2617
2618         if (dirty)
2619                 kvm_release_pfn_dirty(pfn);
2620         else
2621                 kvm_release_pfn_clean(pfn);
2622 }
2623
2624 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2625 {
2626         kvm_pfn_t pfn;
2627         void *hva = NULL;
2628         struct page *page = KVM_UNMAPPED_PAGE;
2629
2630         if (!map)
2631                 return -EINVAL;
2632
2633         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2634         if (is_error_noslot_pfn(pfn))
2635                 return -EINVAL;
2636
2637         if (pfn_valid(pfn)) {
2638                 page = pfn_to_page(pfn);
2639                 hva = kmap(page);
2640 #ifdef CONFIG_HAS_IOMEM
2641         } else {
2642                 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2643 #endif
2644         }
2645
2646         if (!hva)
2647                 return -EFAULT;
2648
2649         map->page = page;
2650         map->hva = hva;
2651         map->pfn = pfn;
2652         map->gfn = gfn;
2653
2654         return 0;
2655 }
2656 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2657
2658 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2659 {
2660         if (!map)
2661                 return;
2662
2663         if (!map->hva)
2664                 return;
2665
2666         if (map->page != KVM_UNMAPPED_PAGE)
2667                 kunmap(map->page);
2668 #ifdef CONFIG_HAS_IOMEM
2669         else
2670                 memunmap(map->hva);
2671 #endif
2672
2673         if (dirty)
2674                 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2675
2676         kvm_release_pfn(map->pfn, dirty);
2677
2678         map->hva = NULL;
2679         map->page = NULL;
2680 }
2681 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2682
2683 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2684 {
2685         kvm_pfn_t pfn;
2686
2687         pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2688
2689         return kvm_pfn_to_page(pfn);
2690 }
2691 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2692
2693 void kvm_release_page_clean(struct page *page)
2694 {
2695         WARN_ON(is_error_page(page));
2696
2697         kvm_release_pfn_clean(page_to_pfn(page));
2698 }
2699 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2700
2701 void kvm_release_pfn_clean(kvm_pfn_t pfn)
2702 {
2703         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2704                 put_page(pfn_to_page(pfn));
2705 }
2706 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2707
2708 void kvm_release_page_dirty(struct page *page)
2709 {
2710         WARN_ON(is_error_page(page));
2711
2712         kvm_release_pfn_dirty(page_to_pfn(page));
2713 }
2714 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2715
2716 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2717 {
2718         kvm_set_pfn_dirty(pfn);
2719         kvm_release_pfn_clean(pfn);
2720 }
2721 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2722
2723 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2724 {
2725         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2726                 SetPageDirty(pfn_to_page(pfn));
2727 }
2728 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2729
2730 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2731 {
2732         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2733                 mark_page_accessed(pfn_to_page(pfn));
2734 }
2735 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2736
2737 static int next_segment(unsigned long len, int offset)
2738 {
2739         if (len > PAGE_SIZE - offset)
2740                 return PAGE_SIZE - offset;
2741         else
2742                 return len;
2743 }
2744
2745 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2746                                  void *data, int offset, int len)
2747 {
2748         int r;
2749         unsigned long addr;
2750
2751         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2752         if (kvm_is_error_hva(addr))
2753                 return -EFAULT;
2754         r = __copy_from_user(data, (void __user *)addr + offset, len);
2755         if (r)
2756                 return -EFAULT;
2757         return 0;
2758 }
2759
2760 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2761                         int len)
2762 {
2763         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2764
2765         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2766 }
2767 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2768
2769 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2770                              int offset, int len)
2771 {
2772         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2773
2774         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2775 }
2776 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2777
2778 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2779 {
2780         gfn_t gfn = gpa >> PAGE_SHIFT;
2781         int seg;
2782         int offset = offset_in_page(gpa);
2783         int ret;
2784
2785         while ((seg = next_segment(len, offset)) != 0) {
2786                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2787                 if (ret < 0)
2788                         return ret;
2789                 offset = 0;
2790                 len -= seg;
2791                 data += seg;
2792                 ++gfn;
2793         }
2794         return 0;
2795 }
2796 EXPORT_SYMBOL_GPL(kvm_read_guest);
2797
2798 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2799 {
2800         gfn_t gfn = gpa >> PAGE_SHIFT;
2801         int seg;
2802         int offset = offset_in_page(gpa);
2803         int ret;
2804
2805         while ((seg = next_segment(len, offset)) != 0) {
2806                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2807                 if (ret < 0)
2808                         return ret;
2809                 offset = 0;
2810                 len -= seg;
2811                 data += seg;
2812                 ++gfn;
2813         }
2814         return 0;
2815 }
2816 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2817
2818 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2819                                    void *data, int offset, unsigned long len)
2820 {
2821         int r;
2822         unsigned long addr;
2823
2824         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2825         if (kvm_is_error_hva(addr))
2826                 return -EFAULT;
2827         pagefault_disable();
2828         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2829         pagefault_enable();
2830         if (r)
2831                 return -EFAULT;
2832         return 0;
2833 }
2834
2835 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2836                                void *data, unsigned long len)
2837 {
2838         gfn_t gfn = gpa >> PAGE_SHIFT;
2839         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2840         int offset = offset_in_page(gpa);
2841
2842         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2843 }
2844 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2845
2846 static int __kvm_write_guest_page(struct kvm *kvm,
2847                                   struct kvm_memory_slot *memslot, gfn_t gfn,
2848                                   const void *data, int offset, int len)
2849 {
2850         int r;
2851         unsigned long addr;
2852
2853         addr = gfn_to_hva_memslot(memslot, gfn);
2854         if (kvm_is_error_hva(addr))
2855                 return -EFAULT;
2856         r = __copy_to_user((void __user *)addr + offset, data, len);
2857         if (r)
2858                 return -EFAULT;
2859         mark_page_dirty_in_slot(kvm, memslot, gfn);
2860         return 0;
2861 }
2862
2863 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2864                          const void *data, int offset, int len)
2865 {
2866         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2867
2868         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2869 }
2870 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2871
2872 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2873                               const void *data, int offset, int len)
2874 {
2875         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2876
2877         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2878 }
2879 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2880
2881 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2882                     unsigned long len)
2883 {
2884         gfn_t gfn = gpa >> PAGE_SHIFT;
2885         int seg;
2886         int offset = offset_in_page(gpa);
2887         int ret;
2888
2889         while ((seg = next_segment(len, offset)) != 0) {
2890                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2891                 if (ret < 0)
2892                         return ret;
2893                 offset = 0;
2894                 len -= seg;
2895                 data += seg;
2896                 ++gfn;
2897         }
2898         return 0;
2899 }
2900 EXPORT_SYMBOL_GPL(kvm_write_guest);
2901
2902 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2903                          unsigned long len)
2904 {
2905         gfn_t gfn = gpa >> PAGE_SHIFT;
2906         int seg;
2907         int offset = offset_in_page(gpa);
2908         int ret;
2909
2910         while ((seg = next_segment(len, offset)) != 0) {
2911                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2912                 if (ret < 0)
2913                         return ret;
2914                 offset = 0;
2915                 len -= seg;
2916                 data += seg;
2917                 ++gfn;
2918         }
2919         return 0;
2920 }
2921 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2922
2923 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2924                                        struct gfn_to_hva_cache *ghc,
2925                                        gpa_t gpa, unsigned long len)
2926 {
2927         int offset = offset_in_page(gpa);
2928         gfn_t start_gfn = gpa >> PAGE_SHIFT;
2929         gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2930         gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2931         gfn_t nr_pages_avail;
2932
2933         /* Update ghc->generation before performing any error checks. */
2934         ghc->generation = slots->generation;
2935
2936         if (start_gfn > end_gfn) {
2937                 ghc->hva = KVM_HVA_ERR_BAD;
2938                 return -EINVAL;
2939         }
2940
2941         /*
2942          * If the requested region crosses two memslots, we still
2943          * verify that the entire region is valid here.
2944          */
2945         for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2946                 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2947                 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2948                                            &nr_pages_avail);
2949                 if (kvm_is_error_hva(ghc->hva))
2950                         return -EFAULT;
2951         }
2952
2953         /* Use the slow path for cross page reads and writes. */
2954         if (nr_pages_needed == 1)
2955                 ghc->hva += offset;
2956         else
2957                 ghc->memslot = NULL;
2958
2959         ghc->gpa = gpa;
2960         ghc->len = len;
2961         return 0;
2962 }
2963
2964 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2965                               gpa_t gpa, unsigned long len)
2966 {
2967         struct kvm_memslots *slots = kvm_memslots(kvm);
2968         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2969 }
2970 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2971
2972 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2973                                   void *data, unsigned int offset,
2974                                   unsigned long len)
2975 {
2976         struct kvm_memslots *slots = kvm_memslots(kvm);
2977         int r;
2978         gpa_t gpa = ghc->gpa + offset;
2979
2980         if (WARN_ON_ONCE(len + offset > ghc->len))
2981                 return -EINVAL;
2982
2983         if (slots->generation != ghc->generation) {
2984                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2985                         return -EFAULT;
2986         }
2987
2988         if (kvm_is_error_hva(ghc->hva))
2989                 return -EFAULT;
2990
2991         if (unlikely(!ghc->memslot))
2992                 return kvm_write_guest(kvm, gpa, data, len);
2993
2994         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2995         if (r)
2996                 return -EFAULT;
2997         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
2998
2999         return 0;
3000 }
3001 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3002
3003 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3004                            void *data, unsigned long len)
3005 {
3006         return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3007 }
3008 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3009
3010 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3011                                  void *data, unsigned int offset,
3012                                  unsigned long len)
3013 {
3014         struct kvm_memslots *slots = kvm_memslots(kvm);
3015         int r;
3016         gpa_t gpa = ghc->gpa + offset;
3017
3018         if (WARN_ON_ONCE(len + offset > ghc->len))
3019                 return -EINVAL;
3020
3021         if (slots->generation != ghc->generation) {
3022                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3023                         return -EFAULT;
3024         }
3025
3026         if (kvm_is_error_hva(ghc->hva))
3027                 return -EFAULT;
3028
3029         if (unlikely(!ghc->memslot))
3030                 return kvm_read_guest(kvm, gpa, data, len);
3031
3032         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3033         if (r)
3034                 return -EFAULT;
3035
3036         return 0;
3037 }
3038 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3039
3040 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3041                           void *data, unsigned long len)
3042 {
3043         return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3044 }
3045 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3046
3047 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3048 {
3049         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3050         gfn_t gfn = gpa >> PAGE_SHIFT;
3051         int seg;
3052         int offset = offset_in_page(gpa);
3053         int ret;
3054
3055         while ((seg = next_segment(len, offset)) != 0) {
3056                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3057                 if (ret < 0)
3058                         return ret;
3059                 offset = 0;
3060                 len -= seg;
3061                 ++gfn;
3062         }
3063         return 0;
3064 }
3065 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3066
3067 void mark_page_dirty_in_slot(struct kvm *kvm,
3068                              struct kvm_memory_slot *memslot,
3069                              gfn_t gfn)
3070 {
3071         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3072                 unsigned long rel_gfn = gfn - memslot->base_gfn;
3073                 u32 slot = (memslot->as_id << 16) | memslot->id;
3074
3075                 if (kvm->dirty_ring_size)
3076                         kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3077                                             slot, rel_gfn);
3078                 else
3079                         set_bit_le(rel_gfn, memslot->dirty_bitmap);
3080         }
3081 }
3082 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3083
3084 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3085 {
3086         struct kvm_memory_slot *memslot;
3087
3088         memslot = gfn_to_memslot(kvm, gfn);
3089         mark_page_dirty_in_slot(kvm, memslot, gfn);
3090 }
3091 EXPORT_SYMBOL_GPL(mark_page_dirty);
3092
3093 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3094 {
3095         struct kvm_memory_slot *memslot;
3096
3097         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3098         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3099 }
3100 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3101
3102 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3103 {
3104         if (!vcpu->sigset_active)
3105                 return;
3106
3107         /*
3108          * This does a lockless modification of ->real_blocked, which is fine
3109          * because, only current can change ->real_blocked and all readers of
3110          * ->real_blocked don't care as long ->real_blocked is always a subset
3111          * of ->blocked.
3112          */
3113         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3114 }
3115
3116 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3117 {
3118         if (!vcpu->sigset_active)
3119                 return;
3120
3121         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3122         sigemptyset(&current->real_blocked);
3123 }
3124
3125 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3126 {
3127         unsigned int old, val, grow, grow_start;
3128
3129         old = val = vcpu->halt_poll_ns;
3130         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3131         grow = READ_ONCE(halt_poll_ns_grow);
3132         if (!grow)
3133                 goto out;
3134
3135         val *= grow;
3136         if (val < grow_start)
3137                 val = grow_start;
3138
3139         if (val > vcpu->kvm->max_halt_poll_ns)
3140                 val = vcpu->kvm->max_halt_poll_ns;
3141
3142         vcpu->halt_poll_ns = val;
3143 out:
3144         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3145 }
3146
3147 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3148 {
3149         unsigned int old, val, shrink, grow_start;
3150
3151         old = val = vcpu->halt_poll_ns;
3152         shrink = READ_ONCE(halt_poll_ns_shrink);
3153         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3154         if (shrink == 0)
3155                 val = 0;
3156         else
3157                 val /= shrink;
3158
3159         if (val < grow_start)
3160                 val = 0;
3161
3162         vcpu->halt_poll_ns = val;
3163         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3164 }
3165
3166 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3167 {
3168         int ret = -EINTR;
3169         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3170
3171         if (kvm_arch_vcpu_runnable(vcpu)) {
3172                 kvm_make_request(KVM_REQ_UNHALT, vcpu);
3173                 goto out;
3174         }
3175         if (kvm_cpu_has_pending_timer(vcpu))
3176                 goto out;
3177         if (signal_pending(current))
3178                 goto out;
3179         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3180                 goto out;
3181
3182         ret = 0;
3183 out:
3184         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3185         return ret;
3186 }
3187
3188 static inline void
3189 update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3190 {
3191         if (waited)
3192                 vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3193         else
3194                 vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3195 }
3196
3197 /*
3198  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
3199  */
3200 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3201 {
3202         ktime_t start, cur, poll_end;
3203         bool waited = false;
3204         u64 block_ns;
3205
3206         kvm_arch_vcpu_blocking(vcpu);
3207
3208         start = cur = poll_end = ktime_get();
3209         if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
3210                 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3211
3212                 ++vcpu->stat.generic.halt_attempted_poll;
3213                 do {
3214                         /*
3215                          * This sets KVM_REQ_UNHALT if an interrupt
3216                          * arrives.
3217                          */
3218                         if (kvm_vcpu_check_block(vcpu) < 0) {
3219                                 ++vcpu->stat.generic.halt_successful_poll;
3220                                 if (!vcpu_valid_wakeup(vcpu))
3221                                         ++vcpu->stat.generic.halt_poll_invalid;
3222
3223                                 KVM_STATS_LOG_HIST_UPDATE(
3224                                       vcpu->stat.generic.halt_poll_success_hist,
3225                                       ktime_to_ns(ktime_get()) -
3226                                       ktime_to_ns(start));
3227                                 goto out;
3228                         }
3229                         cpu_relax();
3230                         poll_end = cur = ktime_get();
3231                 } while (kvm_vcpu_can_poll(cur, stop));
3232
3233                 KVM_STATS_LOG_HIST_UPDATE(
3234                                 vcpu->stat.generic.halt_poll_fail_hist,
3235                                 ktime_to_ns(ktime_get()) - ktime_to_ns(start));
3236         }
3237
3238
3239         prepare_to_rcuwait(&vcpu->wait);
3240         for (;;) {
3241                 set_current_state(TASK_INTERRUPTIBLE);
3242
3243                 if (kvm_vcpu_check_block(vcpu) < 0)
3244                         break;
3245
3246                 waited = true;
3247                 schedule();
3248         }
3249         finish_rcuwait(&vcpu->wait);
3250         cur = ktime_get();
3251         if (waited) {
3252                 vcpu->stat.generic.halt_wait_ns +=
3253                         ktime_to_ns(cur) - ktime_to_ns(poll_end);
3254                 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3255                                 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3256         }
3257 out:
3258         kvm_arch_vcpu_unblocking(vcpu);
3259         block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3260
3261         update_halt_poll_stats(
3262                 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3263
3264         if (!kvm_arch_no_poll(vcpu)) {
3265                 if (!vcpu_valid_wakeup(vcpu)) {
3266                         shrink_halt_poll_ns(vcpu);
3267                 } else if (vcpu->kvm->max_halt_poll_ns) {
3268                         if (block_ns <= vcpu->halt_poll_ns)
3269                                 ;
3270                         /* we had a long block, shrink polling */
3271                         else if (vcpu->halt_poll_ns &&
3272                                         block_ns > vcpu->kvm->max_halt_poll_ns)
3273                                 shrink_halt_poll_ns(vcpu);
3274                         /* we had a short halt and our poll time is too small */
3275                         else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3276                                         block_ns < vcpu->kvm->max_halt_poll_ns)
3277                                 grow_halt_poll_ns(vcpu);
3278                 } else {
3279                         vcpu->halt_poll_ns = 0;
3280                 }
3281         }
3282
3283         trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3284         kvm_arch_vcpu_block_finish(vcpu);
3285 }
3286 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3287
3288 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3289 {
3290         struct rcuwait *waitp;
3291
3292         waitp = kvm_arch_vcpu_get_wait(vcpu);
3293         if (rcuwait_wake_up(waitp)) {
3294                 WRITE_ONCE(vcpu->ready, true);
3295                 ++vcpu->stat.generic.halt_wakeup;
3296                 return true;
3297         }
3298
3299         return false;
3300 }
3301 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3302
3303 #ifndef CONFIG_S390
3304 /*
3305  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3306  */
3307 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3308 {
3309         int me, cpu;
3310
3311         if (kvm_vcpu_wake_up(vcpu))
3312                 return;
3313
3314         /*
3315          * Note, the vCPU could get migrated to a different pCPU at any point
3316          * after kvm_arch_vcpu_should_kick(), which could result in sending an
3317          * IPI to the previous pCPU.  But, that's ok because the purpose of the
3318          * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3319          * vCPU also requires it to leave IN_GUEST_MODE.
3320          */
3321         me = get_cpu();
3322         if (kvm_arch_vcpu_should_kick(vcpu)) {
3323                 cpu = READ_ONCE(vcpu->cpu);
3324                 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3325                         smp_send_reschedule(cpu);
3326         }
3327         put_cpu();
3328 }
3329 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3330 #endif /* !CONFIG_S390 */
3331
3332 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3333 {
3334         struct pid *pid;
3335         struct task_struct *task = NULL;
3336         int ret = 0;
3337
3338         rcu_read_lock();
3339         pid = rcu_dereference(target->pid);
3340         if (pid)
3341                 task = get_pid_task(pid, PIDTYPE_PID);
3342         rcu_read_unlock();
3343         if (!task)
3344                 return ret;
3345         ret = yield_to(task, 1);
3346         put_task_struct(task);
3347
3348         return ret;
3349 }
3350 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3351
3352 /*
3353  * Helper that checks whether a VCPU is eligible for directed yield.
3354  * Most eligible candidate to yield is decided by following heuristics:
3355  *
3356  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3357  *  (preempted lock holder), indicated by @in_spin_loop.
3358  *  Set at the beginning and cleared at the end of interception/PLE handler.
3359  *
3360  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3361  *  chance last time (mostly it has become eligible now since we have probably
3362  *  yielded to lockholder in last iteration. This is done by toggling
3363  *  @dy_eligible each time a VCPU checked for eligibility.)
3364  *
3365  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3366  *  to preempted lock-holder could result in wrong VCPU selection and CPU
3367  *  burning. Giving priority for a potential lock-holder increases lock
3368  *  progress.
3369  *
3370  *  Since algorithm is based on heuristics, accessing another VCPU data without
3371  *  locking does not harm. It may result in trying to yield to  same VCPU, fail
3372  *  and continue with next VCPU and so on.
3373  */
3374 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3375 {
3376 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3377         bool eligible;
3378
3379         eligible = !vcpu->spin_loop.in_spin_loop ||
3380                     vcpu->spin_loop.dy_eligible;
3381
3382         if (vcpu->spin_loop.in_spin_loop)
3383                 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3384
3385         return eligible;
3386 #else
3387         return true;
3388 #endif
3389 }
3390
3391 /*
3392  * Unlike kvm_arch_vcpu_runnable, this function is called outside
3393  * a vcpu_load/vcpu_put pair.  However, for most architectures
3394  * kvm_arch_vcpu_runnable does not require vcpu_load.
3395  */
3396 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3397 {
3398         return kvm_arch_vcpu_runnable(vcpu);
3399 }
3400
3401 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3402 {
3403         if (kvm_arch_dy_runnable(vcpu))
3404                 return true;
3405
3406 #ifdef CONFIG_KVM_ASYNC_PF
3407         if (!list_empty_careful(&vcpu->async_pf.done))
3408                 return true;
3409 #endif
3410
3411         return false;
3412 }
3413
3414 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3415 {
3416         return false;
3417 }
3418
3419 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3420 {
3421         struct kvm *kvm = me->kvm;
3422         struct kvm_vcpu *vcpu;
3423         int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3424         unsigned long i;
3425         int yielded = 0;
3426         int try = 3;
3427         int pass;
3428
3429         kvm_vcpu_set_in_spin_loop(me, true);
3430         /*
3431          * We boost the priority of a VCPU that is runnable but not
3432          * currently running, because it got preempted by something
3433          * else and called schedule in __vcpu_run.  Hopefully that
3434          * VCPU is holding the lock that we need and will release it.
3435          * We approximate round-robin by starting at the last boosted VCPU.
3436          */
3437         for (pass = 0; pass < 2 && !yielded && try; pass++) {
3438                 kvm_for_each_vcpu(i, vcpu, kvm) {
3439                         if (!pass && i <= last_boosted_vcpu) {
3440                                 i = last_boosted_vcpu;
3441                                 continue;
3442                         } else if (pass && i > last_boosted_vcpu)
3443                                 break;
3444                         if (!READ_ONCE(vcpu->ready))
3445                                 continue;
3446                         if (vcpu == me)
3447                                 continue;
3448                         if (rcuwait_active(&vcpu->wait) &&
3449                             !vcpu_dy_runnable(vcpu))
3450                                 continue;
3451                         if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3452                             !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3453                             !kvm_arch_vcpu_in_kernel(vcpu))
3454                                 continue;
3455                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3456                                 continue;
3457
3458                         yielded = kvm_vcpu_yield_to(vcpu);
3459                         if (yielded > 0) {
3460                                 kvm->last_boosted_vcpu = i;
3461                                 break;
3462                         } else if (yielded < 0) {
3463                                 try--;
3464                                 if (!try)
3465                                         break;
3466                         }
3467                 }
3468         }
3469         kvm_vcpu_set_in_spin_loop(me, false);
3470
3471         /* Ensure vcpu is not eligible during next spinloop */
3472         kvm_vcpu_set_dy_eligible(me, false);
3473 }
3474 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3475
3476 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3477 {
3478 #if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3479         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3480             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3481              kvm->dirty_ring_size / PAGE_SIZE);
3482 #else
3483         return false;
3484 #endif
3485 }
3486
3487 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3488 {
3489         struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3490         struct page *page;
3491
3492         if (vmf->pgoff == 0)
3493                 page = virt_to_page(vcpu->run);
3494 #ifdef CONFIG_X86
3495         else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3496                 page = virt_to_page(vcpu->arch.pio_data);
3497 #endif
3498 #ifdef CONFIG_KVM_MMIO
3499         else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3500                 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3501 #endif
3502         else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3503                 page = kvm_dirty_ring_get_page(
3504                     &vcpu->dirty_ring,
3505                     vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3506         else
3507                 return kvm_arch_vcpu_fault(vcpu, vmf);
3508         get_page(page);
3509         vmf->page = page;
3510         return 0;
3511 }
3512
3513 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3514         .fault = kvm_vcpu_fault,
3515 };
3516
3517 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3518 {
3519         struct kvm_vcpu *vcpu = file->private_data;
3520         unsigned long pages = vma_pages(vma);
3521
3522         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3523              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3524             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3525                 return -EINVAL;
3526
3527         vma->vm_ops = &kvm_vcpu_vm_ops;
3528         return 0;
3529 }
3530
3531 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3532 {
3533         struct kvm_vcpu *vcpu = filp->private_data;
3534
3535         kvm_put_kvm(vcpu->kvm);
3536         return 0;
3537 }
3538
3539 static struct file_operations kvm_vcpu_fops = {
3540         .release        = kvm_vcpu_release,
3541         .unlocked_ioctl = kvm_vcpu_ioctl,
3542         .mmap           = kvm_vcpu_mmap,
3543         .llseek         = noop_llseek,
3544         KVM_COMPAT(kvm_vcpu_compat_ioctl),
3545 };
3546
3547 /*
3548  * Allocates an inode for the vcpu.
3549  */
3550 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3551 {
3552         char name[8 + 1 + ITOA_MAX_LEN + 1];
3553
3554         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3555         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3556 }
3557
3558 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3559 {
3560 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3561         struct dentry *debugfs_dentry;
3562         char dir_name[ITOA_MAX_LEN * 2];
3563
3564         if (!debugfs_initialized())
3565                 return;
3566
3567         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3568         debugfs_dentry = debugfs_create_dir(dir_name,
3569                                             vcpu->kvm->debugfs_dentry);
3570
3571         kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3572 #endif
3573 }
3574
3575 /*
3576  * Creates some virtual cpus.  Good luck creating more than one.
3577  */
3578 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3579 {
3580         int r;
3581         struct kvm_vcpu *vcpu;
3582         struct page *page;
3583
3584         if (id >= KVM_MAX_VCPU_IDS)
3585                 return -EINVAL;
3586
3587         mutex_lock(&kvm->lock);
3588         if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3589                 mutex_unlock(&kvm->lock);
3590                 return -EINVAL;
3591         }
3592
3593         kvm->created_vcpus++;
3594         mutex_unlock(&kvm->lock);
3595
3596         r = kvm_arch_vcpu_precreate(kvm, id);
3597         if (r)
3598                 goto vcpu_decrement;
3599
3600         vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3601         if (!vcpu) {
3602                 r = -ENOMEM;
3603                 goto vcpu_decrement;
3604         }
3605
3606         BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3607         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3608         if (!page) {
3609                 r = -ENOMEM;
3610                 goto vcpu_free;
3611         }
3612         vcpu->run = page_address(page);
3613
3614         kvm_vcpu_init(vcpu, kvm, id);
3615
3616         r = kvm_arch_vcpu_create(vcpu);
3617         if (r)
3618                 goto vcpu_free_run_page;
3619
3620         if (kvm->dirty_ring_size) {
3621                 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3622                                          id, kvm->dirty_ring_size);
3623                 if (r)
3624                         goto arch_vcpu_destroy;
3625         }
3626
3627         mutex_lock(&kvm->lock);
3628         if (kvm_get_vcpu_by_id(kvm, id)) {
3629                 r = -EEXIST;
3630                 goto unlock_vcpu_destroy;
3631         }
3632
3633         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3634         r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
3635         BUG_ON(r == -EBUSY);
3636         if (r)
3637                 goto unlock_vcpu_destroy;
3638
3639         /* Fill the stats id string for the vcpu */
3640         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3641                  task_pid_nr(current), id);
3642
3643         /* Now it's all set up, let userspace reach it */
3644         kvm_get_kvm(kvm);
3645         r = create_vcpu_fd(vcpu);
3646         if (r < 0) {
3647                 xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
3648                 kvm_put_kvm_no_destroy(kvm);
3649                 goto unlock_vcpu_destroy;
3650         }
3651
3652         /*
3653          * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
3654          * pointer before kvm->online_vcpu's incremented value.
3655          */
3656         smp_wmb();
3657         atomic_inc(&kvm->online_vcpus);
3658
3659         mutex_unlock(&kvm->lock);
3660         kvm_arch_vcpu_postcreate(vcpu);
3661         kvm_create_vcpu_debugfs(vcpu);
3662         return r;
3663
3664 unlock_vcpu_destroy:
3665         mutex_unlock(&kvm->lock);
3666         kvm_dirty_ring_free(&vcpu->dirty_ring);
3667 arch_vcpu_destroy:
3668         kvm_arch_vcpu_destroy(vcpu);
3669 vcpu_free_run_page:
3670         free_page((unsigned long)vcpu->run);
3671 vcpu_free:
3672         kmem_cache_free(kvm_vcpu_cache, vcpu);
3673 vcpu_decrement:
3674         mutex_lock(&kvm->lock);
3675         kvm->created_vcpus--;
3676         mutex_unlock(&kvm->lock);
3677         return r;
3678 }
3679
3680 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3681 {
3682         if (sigset) {
3683                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3684                 vcpu->sigset_active = 1;
3685                 vcpu->sigset = *sigset;
3686         } else
3687                 vcpu->sigset_active = 0;
3688         return 0;
3689 }
3690
3691 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3692                               size_t size, loff_t *offset)
3693 {
3694         struct kvm_vcpu *vcpu = file->private_data;
3695
3696         return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3697                         &kvm_vcpu_stats_desc[0], &vcpu->stat,
3698                         sizeof(vcpu->stat), user_buffer, size, offset);
3699 }
3700
3701 static const struct file_operations kvm_vcpu_stats_fops = {
3702         .read = kvm_vcpu_stats_read,
3703         .llseek = noop_llseek,
3704 };
3705
3706 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3707 {
3708         int fd;
3709         struct file *file;
3710         char name[15 + ITOA_MAX_LEN + 1];
3711
3712         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3713
3714         fd = get_unused_fd_flags(O_CLOEXEC);
3715         if (fd < 0)
3716                 return fd;
3717
3718         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3719         if (IS_ERR(file)) {
3720                 put_unused_fd(fd);
3721                 return PTR_ERR(file);
3722         }
3723         file->f_mode |= FMODE_PREAD;
3724         fd_install(fd, file);
3725
3726         return fd;
3727 }
3728
3729 static long kvm_vcpu_ioctl(struct file *filp,
3730                            unsigned int ioctl, unsigned long arg)
3731 {
3732         struct kvm_vcpu *vcpu = filp->private_data;
3733         void __user *argp = (void __user *)arg;
3734         int r;
3735         struct kvm_fpu *fpu = NULL;
3736         struct kvm_sregs *kvm_sregs = NULL;
3737
3738         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3739                 return -EIO;
3740
3741         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3742                 return -EINVAL;
3743
3744         /*
3745          * Some architectures have vcpu ioctls that are asynchronous to vcpu
3746          * execution; mutex_lock() would break them.
3747          */
3748         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3749         if (r != -ENOIOCTLCMD)
3750                 return r;
3751
3752         if (mutex_lock_killable(&vcpu->mutex))
3753                 return -EINTR;
3754         switch (ioctl) {
3755         case KVM_RUN: {
3756                 struct pid *oldpid;
3757                 r = -EINVAL;
3758                 if (arg)
3759                         goto out;
3760                 oldpid = rcu_access_pointer(vcpu->pid);
3761                 if (unlikely(oldpid != task_pid(current))) {
3762                         /* The thread running this VCPU changed. */
3763                         struct pid *newpid;
3764
3765                         r = kvm_arch_vcpu_run_pid_change(vcpu);
3766                         if (r)
3767                                 break;
3768
3769                         newpid = get_task_pid(current, PIDTYPE_PID);
3770                         rcu_assign_pointer(vcpu->pid, newpid);
3771                         if (oldpid)
3772                                 synchronize_rcu();
3773                         put_pid(oldpid);
3774                 }
3775                 r = kvm_arch_vcpu_ioctl_run(vcpu);
3776                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3777                 break;
3778         }
3779         case KVM_GET_REGS: {
3780                 struct kvm_regs *kvm_regs;
3781
3782                 r = -ENOMEM;
3783                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3784                 if (!kvm_regs)
3785                         goto out;
3786                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3787                 if (r)
3788                         goto out_free1;
3789                 r = -EFAULT;
3790                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3791                         goto out_free1;
3792                 r = 0;
3793 out_free1:
3794                 kfree(kvm_regs);
3795                 break;
3796         }
3797         case KVM_SET_REGS: {
3798                 struct kvm_regs *kvm_regs;
3799
3800                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3801                 if (IS_ERR(kvm_regs)) {
3802                         r = PTR_ERR(kvm_regs);
3803                         goto out;
3804                 }
3805                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3806                 kfree(kvm_regs);
3807                 break;
3808         }
3809         case KVM_GET_SREGS: {
3810                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3811                                     GFP_KERNEL_ACCOUNT);
3812                 r = -ENOMEM;
3813                 if (!kvm_sregs)
3814                         goto out;
3815                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3816                 if (r)
3817                         goto out;
3818                 r = -EFAULT;
3819                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3820                         goto out;
3821                 r = 0;
3822                 break;
3823         }
3824         case KVM_SET_SREGS: {
3825                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3826                 if (IS_ERR(kvm_sregs)) {
3827                         r = PTR_ERR(kvm_sregs);
3828                         kvm_sregs = NULL;
3829                         goto out;
3830                 }
3831                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3832                 break;
3833         }
3834         case KVM_GET_MP_STATE: {
3835                 struct kvm_mp_state mp_state;
3836
3837                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3838                 if (r)
3839                         goto out;
3840                 r = -EFAULT;
3841                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3842                         goto out;
3843                 r = 0;
3844                 break;
3845         }
3846         case KVM_SET_MP_STATE: {
3847                 struct kvm_mp_state mp_state;
3848
3849                 r = -EFAULT;
3850                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3851                         goto out;
3852                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3853                 break;
3854         }
3855         case KVM_TRANSLATE: {
3856                 struct kvm_translation tr;
3857
3858                 r = -EFAULT;
3859                 if (copy_from_user(&tr, argp, sizeof(tr)))
3860                         goto out;
3861                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3862                 if (r)
3863                         goto out;
3864                 r = -EFAULT;
3865                 if (copy_to_user(argp, &tr, sizeof(tr)))
3866                         goto out;
3867                 r = 0;
3868                 break;
3869         }
3870         case KVM_SET_GUEST_DEBUG: {
3871                 struct kvm_guest_debug dbg;
3872
3873                 r = -EFAULT;
3874                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3875                         goto out;
3876                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3877                 break;
3878         }
3879         case KVM_SET_SIGNAL_MASK: {
3880                 struct kvm_signal_mask __user *sigmask_arg = argp;
3881                 struct kvm_signal_mask kvm_sigmask;
3882                 sigset_t sigset, *p;
3883
3884                 p = NULL;
3885                 if (argp) {
3886                         r = -EFAULT;
3887                         if (copy_from_user(&kvm_sigmask, argp,
3888                                            sizeof(kvm_sigmask)))
3889                                 goto out;
3890                         r = -EINVAL;
3891                         if (kvm_sigmask.len != sizeof(sigset))
3892                                 goto out;
3893                         r = -EFAULT;
3894                         if (copy_from_user(&sigset, sigmask_arg->sigset,
3895                                            sizeof(sigset)))
3896                                 goto out;
3897                         p = &sigset;
3898                 }
3899                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3900                 break;
3901         }
3902         case KVM_GET_FPU: {
3903                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3904                 r = -ENOMEM;
3905                 if (!fpu)
3906                         goto out;
3907                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3908                 if (r)
3909                         goto out;
3910                 r = -EFAULT;
3911                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3912                         goto out;
3913                 r = 0;
3914                 break;
3915         }
3916         case KVM_SET_FPU: {
3917                 fpu = memdup_user(argp, sizeof(*fpu));
3918                 if (IS_ERR(fpu)) {
3919                         r = PTR_ERR(fpu);
3920                         fpu = NULL;
3921                         goto out;
3922                 }
3923                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3924                 break;
3925         }
3926         case KVM_GET_STATS_FD: {
3927                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3928                 break;
3929         }
3930         default:
3931                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3932         }
3933 out:
3934         mutex_unlock(&vcpu->mutex);
3935         kfree(fpu);
3936         kfree(kvm_sregs);
3937         return r;
3938 }
3939
3940 #ifdef CONFIG_KVM_COMPAT
3941 static long kvm_vcpu_compat_ioctl(struct file *filp,
3942                                   unsigned int ioctl, unsigned long arg)
3943 {
3944         struct kvm_vcpu *vcpu = filp->private_data;
3945         void __user *argp = compat_ptr(arg);
3946         int r;
3947
3948         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3949                 return -EIO;
3950
3951         switch (ioctl) {
3952         case KVM_SET_SIGNAL_MASK: {
3953                 struct kvm_signal_mask __user *sigmask_arg = argp;
3954                 struct kvm_signal_mask kvm_sigmask;
3955                 sigset_t sigset;
3956
3957                 if (argp) {
3958                         r = -EFAULT;
3959                         if (copy_from_user(&kvm_sigmask, argp,
3960                                            sizeof(kvm_sigmask)))
3961                                 goto out;
3962                         r = -EINVAL;
3963                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
3964                                 goto out;
3965                         r = -EFAULT;
3966                         if (get_compat_sigset(&sigset,
3967                                               (compat_sigset_t __user *)sigmask_arg->sigset))
3968                                 goto out;
3969                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3970                 } else
3971                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3972                 break;
3973         }
3974         default:
3975                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3976         }
3977
3978 out:
3979         return r;
3980 }
3981 #endif
3982
3983 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3984 {
3985         struct kvm_device *dev = filp->private_data;
3986
3987         if (dev->ops->mmap)
3988                 return dev->ops->mmap(dev, vma);
3989
3990         return -ENODEV;
3991 }
3992
3993 static int kvm_device_ioctl_attr(struct kvm_device *dev,
3994                                  int (*accessor)(struct kvm_device *dev,
3995                                                  struct kvm_device_attr *attr),
3996                                  unsigned long arg)
3997 {
3998         struct kvm_device_attr attr;
3999
4000         if (!accessor)
4001                 return -EPERM;
4002
4003         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4004                 return -EFAULT;
4005
4006         return accessor(dev, &attr);
4007 }
4008
4009 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4010                              unsigned long arg)
4011 {
4012         struct kvm_device *dev = filp->private_data;
4013
4014         if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4015                 return -EIO;
4016
4017         switch (ioctl) {
4018         case KVM_SET_DEVICE_ATTR:
4019                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4020         case KVM_GET_DEVICE_ATTR:
4021                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4022         case KVM_HAS_DEVICE_ATTR:
4023                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4024         default:
4025                 if (dev->ops->ioctl)
4026                         return dev->ops->ioctl(dev, ioctl, arg);
4027
4028                 return -ENOTTY;
4029         }
4030 }
4031
4032 static int kvm_device_release(struct inode *inode, struct file *filp)
4033 {
4034         struct kvm_device *dev = filp->private_data;
4035         struct kvm *kvm = dev->kvm;
4036
4037         if (dev->ops->release) {
4038                 mutex_lock(&kvm->lock);
4039                 list_del(&dev->vm_node);
4040                 dev->ops->release(dev);
4041                 mutex_unlock(&kvm->lock);
4042         }
4043
4044         kvm_put_kvm(kvm);
4045         return 0;
4046 }
4047
4048 static const struct file_operations kvm_device_fops = {
4049         .unlocked_ioctl = kvm_device_ioctl,
4050         .release = kvm_device_release,
4051         KVM_COMPAT(kvm_device_ioctl),
4052         .mmap = kvm_device_mmap,
4053 };
4054
4055 struct kvm_device *kvm_device_from_filp(struct file *filp)
4056 {
4057         if (filp->f_op != &kvm_device_fops)
4058                 return NULL;
4059
4060         return filp->private_data;
4061 }
4062
4063 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4064 #ifdef CONFIG_KVM_MPIC
4065         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4066         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4067 #endif
4068 };
4069
4070 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4071 {
4072         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4073                 return -ENOSPC;
4074
4075         if (kvm_device_ops_table[type] != NULL)
4076                 return -EEXIST;
4077
4078         kvm_device_ops_table[type] = ops;
4079         return 0;
4080 }
4081
4082 void kvm_unregister_device_ops(u32 type)
4083 {
4084         if (kvm_device_ops_table[type] != NULL)
4085                 kvm_device_ops_table[type] = NULL;
4086 }
4087
4088 static int kvm_ioctl_create_device(struct kvm *kvm,
4089                                    struct kvm_create_device *cd)
4090 {
4091         const struct kvm_device_ops *ops = NULL;
4092         struct kvm_device *dev;
4093         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4094         int type;
4095         int ret;
4096
4097         if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4098                 return -ENODEV;
4099
4100         type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4101         ops = kvm_device_ops_table[type];
4102         if (ops == NULL)
4103                 return -ENODEV;
4104
4105         if (test)
4106                 return 0;
4107
4108         dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4109         if (!dev)
4110                 return -ENOMEM;
4111
4112         dev->ops = ops;
4113         dev->kvm = kvm;
4114
4115         mutex_lock(&kvm->lock);
4116         ret = ops->create(dev, type);
4117         if (ret < 0) {
4118                 mutex_unlock(&kvm->lock);
4119                 kfree(dev);
4120                 return ret;
4121         }
4122         list_add(&dev->vm_node, &kvm->devices);
4123         mutex_unlock(&kvm->lock);
4124
4125         if (ops->init)
4126                 ops->init(dev);
4127
4128         kvm_get_kvm(kvm);
4129         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4130         if (ret < 0) {
4131                 kvm_put_kvm_no_destroy(kvm);
4132                 mutex_lock(&kvm->lock);
4133                 list_del(&dev->vm_node);
4134                 mutex_unlock(&kvm->lock);
4135                 ops->destroy(dev);
4136                 return ret;
4137         }
4138
4139         cd->fd = ret;
4140         return 0;
4141 }
4142
4143 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4144 {
4145         switch (arg) {
4146         case KVM_CAP_USER_MEMORY:
4147         case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4148         case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4149         case KVM_CAP_INTERNAL_ERROR_DATA:
4150 #ifdef CONFIG_HAVE_KVM_MSI
4151         case KVM_CAP_SIGNAL_MSI:
4152 #endif
4153 #ifdef CONFIG_HAVE_KVM_IRQFD
4154         case KVM_CAP_IRQFD:
4155         case KVM_CAP_IRQFD_RESAMPLE:
4156 #endif
4157         case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4158         case KVM_CAP_CHECK_EXTENSION_VM:
4159         case KVM_CAP_ENABLE_CAP_VM:
4160         case KVM_CAP_HALT_POLL:
4161                 return 1;
4162 #ifdef CONFIG_KVM_MMIO
4163         case KVM_CAP_COALESCED_MMIO:
4164                 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4165         case KVM_CAP_COALESCED_PIO:
4166                 return 1;
4167 #endif
4168 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4169         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4170                 return KVM_DIRTY_LOG_MANUAL_CAPS;
4171 #endif
4172 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4173         case KVM_CAP_IRQ_ROUTING:
4174                 return KVM_MAX_IRQ_ROUTES;
4175 #endif
4176 #if KVM_ADDRESS_SPACE_NUM > 1
4177         case KVM_CAP_MULTI_ADDRESS_SPACE:
4178                 return KVM_ADDRESS_SPACE_NUM;
4179 #endif
4180         case KVM_CAP_NR_MEMSLOTS:
4181                 return KVM_USER_MEM_SLOTS;
4182         case KVM_CAP_DIRTY_LOG_RING:
4183 #if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4184                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4185 #else
4186                 return 0;
4187 #endif
4188         case KVM_CAP_BINARY_STATS_FD:
4189                 return 1;
4190         default:
4191                 break;
4192         }
4193         return kvm_vm_ioctl_check_extension(kvm, arg);
4194 }
4195
4196 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4197 {
4198         int r;
4199
4200         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4201                 return -EINVAL;
4202
4203         /* the size should be power of 2 */
4204         if (!size || (size & (size - 1)))
4205                 return -EINVAL;
4206
4207         /* Should be bigger to keep the reserved entries, or a page */
4208         if (size < kvm_dirty_ring_get_rsvd_entries() *
4209             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4210                 return -EINVAL;
4211
4212         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4213             sizeof(struct kvm_dirty_gfn))
4214                 return -E2BIG;
4215
4216         /* We only allow it to set once */
4217         if (kvm->dirty_ring_size)
4218                 return -EINVAL;
4219
4220         mutex_lock(&kvm->lock);
4221
4222         if (kvm->created_vcpus) {
4223                 /* We don't allow to change this value after vcpu created */
4224                 r = -EINVAL;
4225         } else {
4226                 kvm->dirty_ring_size = size;
4227                 r = 0;
4228         }
4229
4230         mutex_unlock(&kvm->lock);
4231         return r;
4232 }
4233
4234 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4235 {
4236         unsigned long i;
4237         struct kvm_vcpu *vcpu;
4238         int cleared = 0;
4239
4240         if (!kvm->dirty_ring_size)
4241                 return -EINVAL;
4242
4243         mutex_lock(&kvm->slots_lock);
4244
4245         kvm_for_each_vcpu(i, vcpu, kvm)
4246                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4247
4248         mutex_unlock(&kvm->slots_lock);
4249
4250         if (cleared)
4251                 kvm_flush_remote_tlbs(kvm);
4252
4253         return cleared;
4254 }
4255
4256 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4257                                                   struct kvm_enable_cap *cap)
4258 {
4259         return -EINVAL;
4260 }
4261
4262 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4263                                            struct kvm_enable_cap *cap)
4264 {
4265         switch (cap->cap) {
4266 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4267         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4268                 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4269
4270                 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4271                         allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4272
4273                 if (cap->flags || (cap->args[0] & ~allowed_options))
4274                         return -EINVAL;
4275                 kvm->manual_dirty_log_protect = cap->args[0];
4276                 return 0;
4277         }
4278 #endif
4279         case KVM_CAP_HALT_POLL: {
4280                 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4281                         return -EINVAL;
4282
4283                 kvm->max_halt_poll_ns = cap->args[0];
4284                 return 0;
4285         }
4286         case KVM_CAP_DIRTY_LOG_RING:
4287                 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4288         default:
4289                 return kvm_vm_ioctl_enable_cap(kvm, cap);
4290         }
4291 }
4292
4293 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4294                               size_t size, loff_t *offset)
4295 {
4296         struct kvm *kvm = file->private_data;
4297
4298         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4299                                 &kvm_vm_stats_desc[0], &kvm->stat,
4300                                 sizeof(kvm->stat), user_buffer, size, offset);
4301 }
4302
4303 static const struct file_operations kvm_vm_stats_fops = {
4304         .read = kvm_vm_stats_read,
4305         .llseek = noop_llseek,
4306 };
4307
4308 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4309 {
4310         int fd;
4311         struct file *file;
4312
4313         fd = get_unused_fd_flags(O_CLOEXEC);
4314         if (fd < 0)
4315                 return fd;
4316
4317         file = anon_inode_getfile("kvm-vm-stats",
4318                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4319         if (IS_ERR(file)) {
4320                 put_unused_fd(fd);
4321                 return PTR_ERR(file);
4322         }
4323         file->f_mode |= FMODE_PREAD;
4324         fd_install(fd, file);
4325
4326         return fd;
4327 }
4328
4329 static long kvm_vm_ioctl(struct file *filp,
4330                            unsigned int ioctl, unsigned long arg)
4331 {
4332         struct kvm *kvm = filp->private_data;
4333         void __user *argp = (void __user *)arg;
4334         int r;
4335
4336         if (kvm->mm != current->mm || kvm->vm_dead)
4337                 return -EIO;
4338         switch (ioctl) {
4339         case KVM_CREATE_VCPU:
4340                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4341                 break;
4342         case KVM_ENABLE_CAP: {
4343                 struct kvm_enable_cap cap;
4344
4345                 r = -EFAULT;
4346                 if (copy_from_user(&cap, argp, sizeof(cap)))
4347                         goto out;
4348                 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4349                 break;
4350         }
4351         case KVM_SET_USER_MEMORY_REGION: {
4352                 struct kvm_userspace_memory_region kvm_userspace_mem;
4353
4354                 r = -EFAULT;
4355                 if (copy_from_user(&kvm_userspace_mem, argp,
4356                                                 sizeof(kvm_userspace_mem)))
4357                         goto out;
4358
4359                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4360                 break;
4361         }
4362         case KVM_GET_DIRTY_LOG: {
4363                 struct kvm_dirty_log log;
4364
4365                 r = -EFAULT;
4366                 if (copy_from_user(&log, argp, sizeof(log)))
4367                         goto out;
4368                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4369                 break;
4370         }
4371 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4372         case KVM_CLEAR_DIRTY_LOG: {
4373                 struct kvm_clear_dirty_log log;
4374
4375                 r = -EFAULT;
4376                 if (copy_from_user(&log, argp, sizeof(log)))
4377                         goto out;
4378                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4379                 break;
4380         }
4381 #endif
4382 #ifdef CONFIG_KVM_MMIO
4383         case KVM_REGISTER_COALESCED_MMIO: {
4384                 struct kvm_coalesced_mmio_zone zone;
4385
4386                 r = -EFAULT;
4387                 if (copy_from_user(&zone, argp, sizeof(zone)))
4388                         goto out;
4389                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4390                 break;
4391         }
4392         case KVM_UNREGISTER_COALESCED_MMIO: {
4393                 struct kvm_coalesced_mmio_zone zone;
4394
4395                 r = -EFAULT;
4396                 if (copy_from_user(&zone, argp, sizeof(zone)))
4397                         goto out;
4398                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4399                 break;
4400         }
4401 #endif
4402         case KVM_IRQFD: {
4403                 struct kvm_irqfd data;
4404
4405                 r = -EFAULT;
4406                 if (copy_from_user(&data, argp, sizeof(data)))
4407                         goto out;
4408                 r = kvm_irqfd(kvm, &data);
4409                 break;
4410         }
4411         case KVM_IOEVENTFD: {
4412                 struct kvm_ioeventfd data;
4413
4414                 r = -EFAULT;
4415                 if (copy_from_user(&data, argp, sizeof(data)))
4416                         goto out;
4417                 r = kvm_ioeventfd(kvm, &data);
4418                 break;
4419         }
4420 #ifdef CONFIG_HAVE_KVM_MSI
4421         case KVM_SIGNAL_MSI: {
4422                 struct kvm_msi msi;
4423
4424                 r = -EFAULT;
4425                 if (copy_from_user(&msi, argp, sizeof(msi)))
4426                         goto out;
4427                 r = kvm_send_userspace_msi(kvm, &msi);
4428                 break;
4429         }
4430 #endif
4431 #ifdef __KVM_HAVE_IRQ_LINE
4432         case KVM_IRQ_LINE_STATUS:
4433         case KVM_IRQ_LINE: {
4434                 struct kvm_irq_level irq_event;
4435
4436                 r = -EFAULT;
4437                 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4438                         goto out;
4439
4440                 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4441                                         ioctl == KVM_IRQ_LINE_STATUS);
4442                 if (r)
4443                         goto out;
4444
4445                 r = -EFAULT;
4446                 if (ioctl == KVM_IRQ_LINE_STATUS) {
4447                         if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4448                                 goto out;
4449                 }
4450
4451                 r = 0;
4452                 break;
4453         }
4454 #endif
4455 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4456         case KVM_SET_GSI_ROUTING: {
4457                 struct kvm_irq_routing routing;
4458                 struct kvm_irq_routing __user *urouting;
4459                 struct kvm_irq_routing_entry *entries = NULL;
4460
4461                 r = -EFAULT;
4462                 if (copy_from_user(&routing, argp, sizeof(routing)))
4463                         goto out;
4464                 r = -EINVAL;
4465                 if (!kvm_arch_can_set_irq_routing(kvm))
4466                         goto out;
4467                 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4468                         goto out;
4469                 if (routing.flags)
4470                         goto out;
4471                 if (routing.nr) {
4472                         urouting = argp;
4473                         entries = vmemdup_user(urouting->entries,
4474                                                array_size(sizeof(*entries),
4475                                                           routing.nr));
4476                         if (IS_ERR(entries)) {
4477                                 r = PTR_ERR(entries);
4478                                 goto out;
4479                         }
4480                 }
4481                 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4482                                         routing.flags);
4483                 kvfree(entries);
4484                 break;
4485         }
4486 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4487         case KVM_CREATE_DEVICE: {
4488                 struct kvm_create_device cd;
4489
4490                 r = -EFAULT;
4491                 if (copy_from_user(&cd, argp, sizeof(cd)))
4492                         goto out;
4493
4494                 r = kvm_ioctl_create_device(kvm, &cd);
4495                 if (r)
4496                         goto out;
4497
4498                 r = -EFAULT;
4499                 if (copy_to_user(argp, &cd, sizeof(cd)))
4500                         goto out;
4501
4502                 r = 0;
4503                 break;
4504         }
4505         case KVM_CHECK_EXTENSION:
4506                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4507                 break;
4508         case KVM_RESET_DIRTY_RINGS:
4509                 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4510                 break;
4511         case KVM_GET_STATS_FD:
4512                 r = kvm_vm_ioctl_get_stats_fd(kvm);
4513                 break;
4514         default:
4515                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4516         }
4517 out:
4518         return r;
4519 }
4520
4521 #ifdef CONFIG_KVM_COMPAT
4522 struct compat_kvm_dirty_log {
4523         __u32 slot;
4524         __u32 padding1;
4525         union {
4526                 compat_uptr_t dirty_bitmap; /* one bit per page */
4527                 __u64 padding2;
4528         };
4529 };
4530
4531 struct compat_kvm_clear_dirty_log {
4532         __u32 slot;
4533         __u32 num_pages;
4534         __u64 first_page;
4535         union {
4536                 compat_uptr_t dirty_bitmap; /* one bit per page */
4537                 __u64 padding2;
4538         };
4539 };
4540
4541 static long kvm_vm_compat_ioctl(struct file *filp,
4542                            unsigned int ioctl, unsigned long arg)
4543 {
4544         struct kvm *kvm = filp->private_data;
4545         int r;
4546
4547         if (kvm->mm != current->mm || kvm->vm_dead)
4548                 return -EIO;
4549         switch (ioctl) {
4550 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4551         case KVM_CLEAR_DIRTY_LOG: {
4552                 struct compat_kvm_clear_dirty_log compat_log;
4553                 struct kvm_clear_dirty_log log;
4554
4555                 if (copy_from_user(&compat_log, (void __user *)arg,
4556                                    sizeof(compat_log)))
4557                         return -EFAULT;
4558                 log.slot         = compat_log.slot;
4559                 log.num_pages    = compat_log.num_pages;
4560                 log.first_page   = compat_log.first_page;
4561                 log.padding2     = compat_log.padding2;
4562                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4563
4564                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4565                 break;
4566         }
4567 #endif
4568         case KVM_GET_DIRTY_LOG: {
4569                 struct compat_kvm_dirty_log compat_log;
4570                 struct kvm_dirty_log log;
4571
4572                 if (copy_from_user(&compat_log, (void __user *)arg,
4573                                    sizeof(compat_log)))
4574                         return -EFAULT;
4575                 log.slot         = compat_log.slot;
4576                 log.padding1     = compat_log.padding1;
4577                 log.padding2     = compat_log.padding2;
4578                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4579
4580                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4581                 break;
4582         }
4583         default:
4584                 r = kvm_vm_ioctl(filp, ioctl, arg);
4585         }
4586         return r;
4587 }
4588 #endif
4589
4590 static struct file_operations kvm_vm_fops = {
4591         .release        = kvm_vm_release,
4592         .unlocked_ioctl = kvm_vm_ioctl,
4593         .llseek         = noop_llseek,
4594         KVM_COMPAT(kvm_vm_compat_ioctl),
4595 };
4596
4597 bool file_is_kvm(struct file *file)
4598 {
4599         return file && file->f_op == &kvm_vm_fops;
4600 }
4601 EXPORT_SYMBOL_GPL(file_is_kvm);
4602
4603 static int kvm_dev_ioctl_create_vm(unsigned long type)
4604 {
4605         int r;
4606         struct kvm *kvm;
4607         struct file *file;
4608
4609         kvm = kvm_create_vm(type);
4610         if (IS_ERR(kvm))
4611                 return PTR_ERR(kvm);
4612 #ifdef CONFIG_KVM_MMIO
4613         r = kvm_coalesced_mmio_init(kvm);
4614         if (r < 0)
4615                 goto put_kvm;
4616 #endif
4617         r = get_unused_fd_flags(O_CLOEXEC);
4618         if (r < 0)
4619                 goto put_kvm;
4620
4621         snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4622                         "kvm-%d", task_pid_nr(current));
4623
4624         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4625         if (IS_ERR(file)) {
4626                 put_unused_fd(r);
4627                 r = PTR_ERR(file);
4628                 goto put_kvm;
4629         }
4630
4631         /*
4632          * Don't call kvm_put_kvm anymore at this point; file->f_op is
4633          * already set, with ->release() being kvm_vm_release().  In error
4634          * cases it will be called by the final fput(file) and will take
4635          * care of doing kvm_put_kvm(kvm).
4636          */
4637         if (kvm_create_vm_debugfs(kvm, r) < 0) {
4638                 put_unused_fd(r);
4639                 fput(file);
4640                 return -ENOMEM;
4641         }
4642         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4643
4644         fd_install(r, file);
4645         return r;
4646
4647 put_kvm:
4648         kvm_put_kvm(kvm);
4649         return r;
4650 }
4651
4652 static long kvm_dev_ioctl(struct file *filp,
4653                           unsigned int ioctl, unsigned long arg)
4654 {
4655         long r = -EINVAL;
4656
4657         switch (ioctl) {
4658         case KVM_GET_API_VERSION:
4659                 if (arg)
4660                         goto out;
4661                 r = KVM_API_VERSION;
4662                 break;
4663         case KVM_CREATE_VM:
4664                 r = kvm_dev_ioctl_create_vm(arg);
4665                 break;
4666         case KVM_CHECK_EXTENSION:
4667                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4668                 break;
4669         case KVM_GET_VCPU_MMAP_SIZE:
4670                 if (arg)
4671                         goto out;
4672                 r = PAGE_SIZE;     /* struct kvm_run */
4673 #ifdef CONFIG_X86
4674                 r += PAGE_SIZE;    /* pio data page */
4675 #endif
4676 #ifdef CONFIG_KVM_MMIO
4677                 r += PAGE_SIZE;    /* coalesced mmio ring page */
4678 #endif
4679                 break;
4680         case KVM_TRACE_ENABLE:
4681         case KVM_TRACE_PAUSE:
4682         case KVM_TRACE_DISABLE:
4683                 r = -EOPNOTSUPP;
4684                 break;
4685         default:
4686                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4687         }
4688 out:
4689         return r;
4690 }
4691
4692 static struct file_operations kvm_chardev_ops = {
4693         .unlocked_ioctl = kvm_dev_ioctl,
4694         .llseek         = noop_llseek,
4695         KVM_COMPAT(kvm_dev_ioctl),
4696 };
4697
4698 static struct miscdevice kvm_dev = {
4699         KVM_MINOR,
4700         "kvm",
4701         &kvm_chardev_ops,
4702 };
4703
4704 static void hardware_enable_nolock(void *junk)
4705 {
4706         int cpu = raw_smp_processor_id();
4707         int r;
4708
4709         if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4710                 return;
4711
4712         cpumask_set_cpu(cpu, cpus_hardware_enabled);
4713
4714         r = kvm_arch_hardware_enable();
4715
4716         if (r) {
4717                 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4718                 atomic_inc(&hardware_enable_failed);
4719                 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4720         }
4721 }
4722
4723 static int kvm_starting_cpu(unsigned int cpu)
4724 {
4725         raw_spin_lock(&kvm_count_lock);
4726         if (kvm_usage_count)
4727                 hardware_enable_nolock(NULL);
4728         raw_spin_unlock(&kvm_count_lock);
4729         return 0;
4730 }
4731
4732 static void hardware_disable_nolock(void *junk)
4733 {
4734         int cpu = raw_smp_processor_id();
4735
4736         if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4737                 return;
4738         cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4739         kvm_arch_hardware_disable();
4740 }
4741
4742 static int kvm_dying_cpu(unsigned int cpu)
4743 {
4744         raw_spin_lock(&kvm_count_lock);
4745         if (kvm_usage_count)
4746                 hardware_disable_nolock(NULL);
4747         raw_spin_unlock(&kvm_count_lock);
4748         return 0;
4749 }
4750
4751 static void hardware_disable_all_nolock(void)
4752 {
4753         BUG_ON(!kvm_usage_count);
4754
4755         kvm_usage_count--;
4756         if (!kvm_usage_count)
4757                 on_each_cpu(hardware_disable_nolock, NULL, 1);
4758 }
4759
4760 static void hardware_disable_all(void)
4761 {
4762         raw_spin_lock(&kvm_count_lock);
4763         hardware_disable_all_nolock();
4764         raw_spin_unlock(&kvm_count_lock);
4765 }
4766
4767 static int hardware_enable_all(void)
4768 {
4769         int r = 0;
4770
4771         raw_spin_lock(&kvm_count_lock);
4772
4773         kvm_usage_count++;
4774         if (kvm_usage_count == 1) {
4775                 atomic_set(&hardware_enable_failed, 0);
4776                 on_each_cpu(hardware_enable_nolock, NULL, 1);
4777
4778                 if (atomic_read(&hardware_enable_failed)) {
4779                         hardware_disable_all_nolock();
4780                         r = -EBUSY;
4781                 }
4782         }
4783
4784         raw_spin_unlock(&kvm_count_lock);
4785
4786         return r;
4787 }
4788
4789 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4790                       void *v)
4791 {
4792         /*
4793          * Some (well, at least mine) BIOSes hang on reboot if
4794          * in vmx root mode.
4795          *
4796          * And Intel TXT required VMX off for all cpu when system shutdown.
4797          */
4798         pr_info("kvm: exiting hardware virtualization\n");
4799         kvm_rebooting = true;
4800         on_each_cpu(hardware_disable_nolock, NULL, 1);
4801         return NOTIFY_OK;
4802 }
4803
4804 static struct notifier_block kvm_reboot_notifier = {
4805         .notifier_call = kvm_reboot,
4806         .priority = 0,
4807 };
4808
4809 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4810 {
4811         int i;
4812
4813         for (i = 0; i < bus->dev_count; i++) {
4814                 struct kvm_io_device *pos = bus->range[i].dev;
4815
4816                 kvm_iodevice_destructor(pos);
4817         }
4818         kfree(bus);
4819 }
4820
4821 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4822                                  const struct kvm_io_range *r2)
4823 {
4824         gpa_t addr1 = r1->addr;
4825         gpa_t addr2 = r2->addr;
4826
4827         if (addr1 < addr2)
4828                 return -1;
4829
4830         /* If r2->len == 0, match the exact address.  If r2->len != 0,
4831          * accept any overlapping write.  Any order is acceptable for
4832          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
4833          * we process all of them.
4834          */
4835         if (r2->len) {
4836                 addr1 += r1->len;
4837                 addr2 += r2->len;
4838         }
4839
4840         if (addr1 > addr2)
4841                 return 1;
4842
4843         return 0;
4844 }
4845
4846 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4847 {
4848         return kvm_io_bus_cmp(p1, p2);
4849 }
4850
4851 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4852                              gpa_t addr, int len)
4853 {
4854         struct kvm_io_range *range, key;
4855         int off;
4856
4857         key = (struct kvm_io_range) {
4858                 .addr = addr,
4859                 .len = len,
4860         };
4861
4862         range = bsearch(&key, bus->range, bus->dev_count,
4863                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4864         if (range == NULL)
4865                 return -ENOENT;
4866
4867         off = range - bus->range;
4868
4869         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4870                 off--;
4871
4872         return off;
4873 }
4874
4875 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4876                               struct kvm_io_range *range, const void *val)
4877 {
4878         int idx;
4879
4880         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4881         if (idx < 0)
4882                 return -EOPNOTSUPP;
4883
4884         while (idx < bus->dev_count &&
4885                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4886                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4887                                         range->len, val))
4888                         return idx;
4889                 idx++;
4890         }
4891
4892         return -EOPNOTSUPP;
4893 }
4894
4895 /* kvm_io_bus_write - called under kvm->slots_lock */
4896 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4897                      int len, const void *val)
4898 {
4899         struct kvm_io_bus *bus;
4900         struct kvm_io_range range;
4901         int r;
4902
4903         range = (struct kvm_io_range) {
4904                 .addr = addr,
4905                 .len = len,
4906         };
4907
4908         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4909         if (!bus)
4910                 return -ENOMEM;
4911         r = __kvm_io_bus_write(vcpu, bus, &range, val);
4912         return r < 0 ? r : 0;
4913 }
4914 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4915
4916 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
4917 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4918                             gpa_t addr, int len, const void *val, long cookie)
4919 {
4920         struct kvm_io_bus *bus;
4921         struct kvm_io_range range;
4922
4923         range = (struct kvm_io_range) {
4924                 .addr = addr,
4925                 .len = len,
4926         };
4927
4928         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4929         if (!bus)
4930                 return -ENOMEM;
4931
4932         /* First try the device referenced by cookie. */
4933         if ((cookie >= 0) && (cookie < bus->dev_count) &&
4934             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4935                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4936                                         val))
4937                         return cookie;
4938
4939         /*
4940          * cookie contained garbage; fall back to search and return the
4941          * correct cookie value.
4942          */
4943         return __kvm_io_bus_write(vcpu, bus, &range, val);
4944 }
4945
4946 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4947                              struct kvm_io_range *range, void *val)
4948 {
4949         int idx;
4950
4951         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4952         if (idx < 0)
4953                 return -EOPNOTSUPP;
4954
4955         while (idx < bus->dev_count &&
4956                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4957                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4958                                        range->len, val))
4959                         return idx;
4960                 idx++;
4961         }
4962
4963         return -EOPNOTSUPP;
4964 }
4965
4966 /* kvm_io_bus_read - called under kvm->slots_lock */
4967 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4968                     int len, void *val)
4969 {
4970         struct kvm_io_bus *bus;
4971         struct kvm_io_range range;
4972         int r;
4973
4974         range = (struct kvm_io_range) {
4975                 .addr = addr,
4976                 .len = len,
4977         };
4978
4979         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4980         if (!bus)
4981                 return -ENOMEM;
4982         r = __kvm_io_bus_read(vcpu, bus, &range, val);
4983         return r < 0 ? r : 0;
4984 }
4985
4986 /* Caller must hold slots_lock. */
4987 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4988                             int len, struct kvm_io_device *dev)
4989 {
4990         int i;
4991         struct kvm_io_bus *new_bus, *bus;
4992         struct kvm_io_range range;
4993
4994         bus = kvm_get_bus(kvm, bus_idx);
4995         if (!bus)
4996                 return -ENOMEM;
4997
4998         /* exclude ioeventfd which is limited by maximum fd */
4999         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5000                 return -ENOSPC;
5001
5002         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5003                           GFP_KERNEL_ACCOUNT);
5004         if (!new_bus)
5005                 return -ENOMEM;
5006
5007         range = (struct kvm_io_range) {
5008                 .addr = addr,
5009                 .len = len,
5010                 .dev = dev,
5011         };
5012
5013         for (i = 0; i < bus->dev_count; i++)
5014                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5015                         break;
5016
5017         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5018         new_bus->dev_count++;
5019         new_bus->range[i] = range;
5020         memcpy(new_bus->range + i + 1, bus->range + i,
5021                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5022         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5023         synchronize_srcu_expedited(&kvm->srcu);
5024         kfree(bus);
5025
5026         return 0;
5027 }
5028
5029 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5030                               struct kvm_io_device *dev)
5031 {
5032         int i, j;
5033         struct kvm_io_bus *new_bus, *bus;
5034
5035         lockdep_assert_held(&kvm->slots_lock);
5036
5037         bus = kvm_get_bus(kvm, bus_idx);
5038         if (!bus)
5039                 return 0;
5040
5041         for (i = 0; i < bus->dev_count; i++) {
5042                 if (bus->range[i].dev == dev) {
5043                         break;
5044                 }
5045         }
5046
5047         if (i == bus->dev_count)
5048                 return 0;
5049
5050         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5051                           GFP_KERNEL_ACCOUNT);
5052         if (new_bus) {
5053                 memcpy(new_bus, bus, struct_size(bus, range, i));
5054                 new_bus->dev_count--;
5055                 memcpy(new_bus->range + i, bus->range + i + 1,
5056                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5057         }
5058
5059         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5060         synchronize_srcu_expedited(&kvm->srcu);
5061
5062         /* Destroy the old bus _after_ installing the (null) bus. */
5063         if (!new_bus) {
5064                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5065                 for (j = 0; j < bus->dev_count; j++) {
5066                         if (j == i)
5067                                 continue;
5068                         kvm_iodevice_destructor(bus->range[j].dev);
5069                 }
5070         }
5071
5072         kfree(bus);
5073         return new_bus ? 0 : -ENOMEM;
5074 }
5075
5076 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5077                                          gpa_t addr)
5078 {
5079         struct kvm_io_bus *bus;
5080         int dev_idx, srcu_idx;
5081         struct kvm_io_device *iodev = NULL;
5082
5083         srcu_idx = srcu_read_lock(&kvm->srcu);
5084
5085         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5086         if (!bus)
5087                 goto out_unlock;
5088
5089         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5090         if (dev_idx < 0)
5091                 goto out_unlock;
5092
5093         iodev = bus->range[dev_idx].dev;
5094
5095 out_unlock:
5096         srcu_read_unlock(&kvm->srcu, srcu_idx);
5097
5098         return iodev;
5099 }
5100 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5101
5102 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5103                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5104                            const char *fmt)
5105 {
5106         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5107                                           inode->i_private;
5108
5109         /*
5110          * The debugfs files are a reference to the kvm struct which
5111         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5112         * avoids the race between open and the removal of the debugfs directory.
5113          */
5114         if (!kvm_get_kvm_safe(stat_data->kvm))
5115                 return -ENOENT;
5116
5117         if (simple_attr_open(inode, file, get,
5118                     kvm_stats_debugfs_mode(stat_data->desc) & 0222
5119                     ? set : NULL,
5120                     fmt)) {
5121                 kvm_put_kvm(stat_data->kvm);
5122                 return -ENOMEM;
5123         }
5124
5125         return 0;
5126 }
5127
5128 static int kvm_debugfs_release(struct inode *inode, struct file *file)
5129 {
5130         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5131                                           inode->i_private;
5132
5133         simple_attr_release(inode, file);
5134         kvm_put_kvm(stat_data->kvm);
5135
5136         return 0;
5137 }
5138
5139 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5140 {
5141         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5142
5143         return 0;
5144 }
5145
5146 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5147 {
5148         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5149
5150         return 0;
5151 }
5152
5153 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5154 {
5155         unsigned long i;
5156         struct kvm_vcpu *vcpu;
5157
5158         *val = 0;
5159
5160         kvm_for_each_vcpu(i, vcpu, kvm)
5161                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5162
5163         return 0;
5164 }
5165
5166 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5167 {
5168         unsigned long i;
5169         struct kvm_vcpu *vcpu;
5170
5171         kvm_for_each_vcpu(i, vcpu, kvm)
5172                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5173
5174         return 0;
5175 }
5176
5177 static int kvm_stat_data_get(void *data, u64 *val)
5178 {
5179         int r = -EFAULT;
5180         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5181
5182         switch (stat_data->kind) {
5183         case KVM_STAT_VM:
5184                 r = kvm_get_stat_per_vm(stat_data->kvm,
5185                                         stat_data->desc->desc.offset, val);
5186                 break;
5187         case KVM_STAT_VCPU:
5188                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5189                                           stat_data->desc->desc.offset, val);
5190                 break;
5191         }
5192
5193         return r;
5194 }
5195
5196 static int kvm_stat_data_clear(void *data, u64 val)
5197 {
5198         int r = -EFAULT;
5199         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5200
5201         if (val)
5202                 return -EINVAL;
5203
5204         switch (stat_data->kind) {
5205         case KVM_STAT_VM:
5206                 r = kvm_clear_stat_per_vm(stat_data->kvm,
5207                                           stat_data->desc->desc.offset);
5208                 break;
5209         case KVM_STAT_VCPU:
5210                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5211                                             stat_data->desc->desc.offset);
5212                 break;
5213         }
5214
5215         return r;
5216 }
5217
5218 static int kvm_stat_data_open(struct inode *inode, struct file *file)
5219 {
5220         __simple_attr_check_format("%llu\n", 0ull);
5221         return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5222                                 kvm_stat_data_clear, "%llu\n");
5223 }
5224
5225 static const struct file_operations stat_fops_per_vm = {
5226         .owner = THIS_MODULE,
5227         .open = kvm_stat_data_open,
5228         .release = kvm_debugfs_release,
5229         .read = simple_attr_read,
5230         .write = simple_attr_write,
5231         .llseek = no_llseek,
5232 };
5233
5234 static int vm_stat_get(void *_offset, u64 *val)
5235 {
5236         unsigned offset = (long)_offset;
5237         struct kvm *kvm;
5238         u64 tmp_val;
5239
5240         *val = 0;
5241         mutex_lock(&kvm_lock);
5242         list_for_each_entry(kvm, &vm_list, vm_list) {
5243                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5244                 *val += tmp_val;
5245         }
5246         mutex_unlock(&kvm_lock);
5247         return 0;
5248 }
5249
5250 static int vm_stat_clear(void *_offset, u64 val)
5251 {
5252         unsigned offset = (long)_offset;
5253         struct kvm *kvm;
5254
5255         if (val)
5256                 return -EINVAL;
5257
5258         mutex_lock(&kvm_lock);
5259         list_for_each_entry(kvm, &vm_list, vm_list) {
5260                 kvm_clear_stat_per_vm(kvm, offset);
5261         }
5262         mutex_unlock(&kvm_lock);
5263
5264         return 0;
5265 }
5266
5267 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5268 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5269
5270 static int vcpu_stat_get(void *_offset, u64 *val)
5271 {
5272         unsigned offset = (long)_offset;
5273         struct kvm *kvm;
5274         u64 tmp_val;
5275
5276         *val = 0;
5277         mutex_lock(&kvm_lock);
5278         list_for_each_entry(kvm, &vm_list, vm_list) {
5279                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5280                 *val += tmp_val;
5281         }
5282         mutex_unlock(&kvm_lock);
5283         return 0;
5284 }
5285
5286 static int vcpu_stat_clear(void *_offset, u64 val)
5287 {
5288         unsigned offset = (long)_offset;
5289         struct kvm *kvm;
5290
5291         if (val)
5292                 return -EINVAL;
5293
5294         mutex_lock(&kvm_lock);
5295         list_for_each_entry(kvm, &vm_list, vm_list) {
5296                 kvm_clear_stat_per_vcpu(kvm, offset);
5297         }
5298         mutex_unlock(&kvm_lock);
5299
5300         return 0;
5301 }
5302
5303 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5304                         "%llu\n");
5305 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5306
5307 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5308 {
5309         struct kobj_uevent_env *env;
5310         unsigned long long created, active;
5311
5312         if (!kvm_dev.this_device || !kvm)
5313                 return;
5314
5315         mutex_lock(&kvm_lock);
5316         if (type == KVM_EVENT_CREATE_VM) {
5317                 kvm_createvm_count++;
5318                 kvm_active_vms++;
5319         } else if (type == KVM_EVENT_DESTROY_VM) {
5320                 kvm_active_vms--;
5321         }
5322         created = kvm_createvm_count;
5323         active = kvm_active_vms;
5324         mutex_unlock(&kvm_lock);
5325
5326         env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5327         if (!env)
5328                 return;
5329
5330         add_uevent_var(env, "CREATED=%llu", created);
5331         add_uevent_var(env, "COUNT=%llu", active);
5332
5333         if (type == KVM_EVENT_CREATE_VM) {
5334                 add_uevent_var(env, "EVENT=create");
5335                 kvm->userspace_pid = task_pid_nr(current);
5336         } else if (type == KVM_EVENT_DESTROY_VM) {
5337                 add_uevent_var(env, "EVENT=destroy");
5338         }
5339         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5340
5341         if (kvm->debugfs_dentry) {
5342                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5343
5344                 if (p) {
5345                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5346                         if (!IS_ERR(tmp))
5347                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
5348                         kfree(p);
5349                 }
5350         }
5351         /* no need for checks, since we are adding at most only 5 keys */
5352         env->envp[env->envp_idx++] = NULL;
5353         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5354         kfree(env);
5355 }
5356
5357 static void kvm_init_debug(void)
5358 {
5359         const struct file_operations *fops;
5360         const struct _kvm_stats_desc *pdesc;
5361         int i;
5362
5363         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5364
5365         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5366                 pdesc = &kvm_vm_stats_desc[i];
5367                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5368                         fops = &vm_stat_fops;
5369                 else
5370                         fops = &vm_stat_readonly_fops;
5371                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5372                                 kvm_debugfs_dir,
5373                                 (void *)(long)pdesc->desc.offset, fops);
5374         }
5375
5376         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5377                 pdesc = &kvm_vcpu_stats_desc[i];
5378                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5379                         fops = &vcpu_stat_fops;
5380                 else
5381                         fops = &vcpu_stat_readonly_fops;
5382                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5383                                 kvm_debugfs_dir,
5384                                 (void *)(long)pdesc->desc.offset, fops);
5385         }
5386 }
5387
5388 static int kvm_suspend(void)
5389 {
5390         if (kvm_usage_count)
5391                 hardware_disable_nolock(NULL);
5392         return 0;
5393 }
5394
5395 static void kvm_resume(void)
5396 {
5397         if (kvm_usage_count) {
5398 #ifdef CONFIG_LOCKDEP
5399                 WARN_ON(lockdep_is_held(&kvm_count_lock));
5400 #endif
5401                 hardware_enable_nolock(NULL);
5402         }
5403 }
5404
5405 static struct syscore_ops kvm_syscore_ops = {
5406         .suspend = kvm_suspend,
5407         .resume = kvm_resume,
5408 };
5409
5410 static inline
5411 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5412 {
5413         return container_of(pn, struct kvm_vcpu, preempt_notifier);
5414 }
5415
5416 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5417 {
5418         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5419
5420         WRITE_ONCE(vcpu->preempted, false);
5421         WRITE_ONCE(vcpu->ready, false);
5422
5423         __this_cpu_write(kvm_running_vcpu, vcpu);
5424         kvm_arch_sched_in(vcpu, cpu);
5425         kvm_arch_vcpu_load(vcpu, cpu);
5426 }
5427
5428 static void kvm_sched_out(struct preempt_notifier *pn,
5429                           struct task_struct *next)
5430 {
5431         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5432
5433         if (current->on_rq) {
5434                 WRITE_ONCE(vcpu->preempted, true);
5435                 WRITE_ONCE(vcpu->ready, true);
5436         }
5437         kvm_arch_vcpu_put(vcpu);
5438         __this_cpu_write(kvm_running_vcpu, NULL);
5439 }
5440
5441 /**
5442  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5443  *
5444  * We can disable preemption locally around accessing the per-CPU variable,
5445  * and use the resolved vcpu pointer after enabling preemption again,
5446  * because even if the current thread is migrated to another CPU, reading
5447  * the per-CPU value later will give us the same value as we update the
5448  * per-CPU variable in the preempt notifier handlers.
5449  */
5450 struct kvm_vcpu *kvm_get_running_vcpu(void)
5451 {
5452         struct kvm_vcpu *vcpu;
5453
5454         preempt_disable();
5455         vcpu = __this_cpu_read(kvm_running_vcpu);
5456         preempt_enable();
5457
5458         return vcpu;
5459 }
5460 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5461
5462 /**
5463  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5464  */
5465 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5466 {
5467         return &kvm_running_vcpu;
5468 }
5469
5470 struct kvm_cpu_compat_check {
5471         void *opaque;
5472         int *ret;
5473 };
5474
5475 static void check_processor_compat(void *data)
5476 {
5477         struct kvm_cpu_compat_check *c = data;
5478
5479         *c->ret = kvm_arch_check_processor_compat(c->opaque);
5480 }
5481
5482 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5483                   struct module *module)
5484 {
5485         struct kvm_cpu_compat_check c;
5486         int r;
5487         int cpu;
5488
5489         r = kvm_arch_init(opaque);
5490         if (r)
5491                 goto out_fail;
5492
5493         /*
5494          * kvm_arch_init makes sure there's at most one caller
5495          * for architectures that support multiple implementations,
5496          * like intel and amd on x86.
5497          * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5498          * conflicts in case kvm is already setup for another implementation.
5499          */
5500         r = kvm_irqfd_init();
5501         if (r)
5502                 goto out_irqfd;
5503
5504         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5505                 r = -ENOMEM;
5506                 goto out_free_0;
5507         }
5508
5509         r = kvm_arch_hardware_setup(opaque);
5510         if (r < 0)
5511                 goto out_free_1;
5512
5513         c.ret = &r;
5514         c.opaque = opaque;
5515         for_each_online_cpu(cpu) {
5516                 smp_call_function_single(cpu, check_processor_compat, &c, 1);
5517                 if (r < 0)
5518                         goto out_free_2;
5519         }
5520
5521         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5522                                       kvm_starting_cpu, kvm_dying_cpu);
5523         if (r)
5524                 goto out_free_2;
5525         register_reboot_notifier(&kvm_reboot_notifier);
5526
5527         /* A kmem cache lets us meet the alignment requirements of fx_save. */
5528         if (!vcpu_align)
5529                 vcpu_align = __alignof__(struct kvm_vcpu);
5530         kvm_vcpu_cache =
5531                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5532                                            SLAB_ACCOUNT,
5533                                            offsetof(struct kvm_vcpu, arch),
5534                                            offsetofend(struct kvm_vcpu, stats_id)
5535                                            - offsetof(struct kvm_vcpu, arch),
5536                                            NULL);
5537         if (!kvm_vcpu_cache) {
5538                 r = -ENOMEM;
5539                 goto out_free_3;
5540         }
5541
5542         for_each_possible_cpu(cpu) {
5543                 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5544                                             GFP_KERNEL, cpu_to_node(cpu))) {
5545                         r = -ENOMEM;
5546                         goto out_free_4;
5547                 }
5548         }
5549
5550         r = kvm_async_pf_init();
5551         if (r)
5552                 goto out_free_5;
5553
5554         kvm_chardev_ops.owner = module;
5555         kvm_vm_fops.owner = module;
5556         kvm_vcpu_fops.owner = module;
5557
5558         r = misc_register(&kvm_dev);
5559         if (r) {
5560                 pr_err("kvm: misc device register failed\n");
5561                 goto out_unreg;
5562         }
5563
5564         register_syscore_ops(&kvm_syscore_ops);
5565
5566         kvm_preempt_ops.sched_in = kvm_sched_in;
5567         kvm_preempt_ops.sched_out = kvm_sched_out;
5568
5569         kvm_init_debug();
5570
5571         r = kvm_vfio_ops_init();
5572         WARN_ON(r);
5573
5574         return 0;
5575
5576 out_unreg:
5577         kvm_async_pf_deinit();
5578 out_free_5:
5579         for_each_possible_cpu(cpu)
5580                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5581 out_free_4:
5582         kmem_cache_destroy(kvm_vcpu_cache);
5583 out_free_3:
5584         unregister_reboot_notifier(&kvm_reboot_notifier);
5585         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5586 out_free_2:
5587         kvm_arch_hardware_unsetup();
5588 out_free_1:
5589         free_cpumask_var(cpus_hardware_enabled);
5590 out_free_0:
5591         kvm_irqfd_exit();
5592 out_irqfd:
5593         kvm_arch_exit();
5594 out_fail:
5595         return r;
5596 }
5597 EXPORT_SYMBOL_GPL(kvm_init);
5598
5599 void kvm_exit(void)
5600 {
5601         int cpu;
5602
5603         debugfs_remove_recursive(kvm_debugfs_dir);
5604         misc_deregister(&kvm_dev);
5605         for_each_possible_cpu(cpu)
5606                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5607         kmem_cache_destroy(kvm_vcpu_cache);
5608         kvm_async_pf_deinit();
5609         unregister_syscore_ops(&kvm_syscore_ops);
5610         unregister_reboot_notifier(&kvm_reboot_notifier);
5611         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5612         on_each_cpu(hardware_disable_nolock, NULL, 1);
5613         kvm_arch_hardware_unsetup();
5614         kvm_arch_exit();
5615         kvm_irqfd_exit();
5616         free_cpumask_var(cpus_hardware_enabled);
5617         kvm_vfio_ops_exit();
5618 }
5619 EXPORT_SYMBOL_GPL(kvm_exit);
5620
5621 struct kvm_vm_worker_thread_context {
5622         struct kvm *kvm;
5623         struct task_struct *parent;
5624         struct completion init_done;
5625         kvm_vm_thread_fn_t thread_fn;
5626         uintptr_t data;
5627         int err;
5628 };
5629
5630 static int kvm_vm_worker_thread(void *context)
5631 {
5632         /*
5633          * The init_context is allocated on the stack of the parent thread, so
5634          * we have to locally copy anything that is needed beyond initialization
5635          */
5636         struct kvm_vm_worker_thread_context *init_context = context;
5637         struct kvm *kvm = init_context->kvm;
5638         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5639         uintptr_t data = init_context->data;
5640         int err;
5641
5642         err = kthread_park(current);
5643         /* kthread_park(current) is never supposed to return an error */
5644         WARN_ON(err != 0);
5645         if (err)
5646                 goto init_complete;
5647
5648         err = cgroup_attach_task_all(init_context->parent, current);
5649         if (err) {
5650                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5651                         __func__, err);
5652                 goto init_complete;
5653         }
5654
5655         set_user_nice(current, task_nice(init_context->parent));
5656
5657 init_complete:
5658         init_context->err = err;
5659         complete(&init_context->init_done);
5660         init_context = NULL;
5661
5662         if (err)
5663                 return err;
5664
5665         /* Wait to be woken up by the spawner before proceeding. */
5666         kthread_parkme();
5667
5668         if (!kthread_should_stop())
5669                 err = thread_fn(kvm, data);
5670
5671         return err;
5672 }
5673
5674 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5675                                 uintptr_t data, const char *name,
5676                                 struct task_struct **thread_ptr)
5677 {
5678         struct kvm_vm_worker_thread_context init_context = {};
5679         struct task_struct *thread;
5680
5681         *thread_ptr = NULL;
5682         init_context.kvm = kvm;
5683         init_context.parent = current;
5684         init_context.thread_fn = thread_fn;
5685         init_context.data = data;
5686         init_completion(&init_context.init_done);
5687
5688         thread = kthread_run(kvm_vm_worker_thread, &init_context,
5689                              "%s-%d", name, task_pid_nr(current));
5690         if (IS_ERR(thread))
5691                 return PTR_ERR(thread);
5692
5693         /* kthread_run is never supposed to return NULL */
5694         WARN_ON(thread == NULL);
5695
5696         wait_for_completion(&init_context.init_done);
5697
5698         if (!init_context.err)
5699                 *thread_ptr = thread;
5700
5701         return init_context.err;
5702 }