arch/arm/kvm/mmu.c

   1 /*
   2  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   3  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License, version 2, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17  */
  18
  19 #include <linux/mman.h>
  20 #include <linux/kvm_host.h>
  21 #include <linux/io.h>
  22 #include <linux/hugetlb.h>
  23 #include <trace/events/kvm.h>
  24 #include <asm/pgalloc.h>
  25 #include <asm/cacheflush.h>
  26 #include <asm/kvm_arm.h>
  27 #include <asm/kvm_mmu.h>
  28 #include <asm/kvm_mmio.h>
  29 #include <asm/kvm_asm.h>
  30 #include <asm/kvm_emulate.h>
  31
  32 #include "trace.h"
  33
  34 extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
  35
  36 static pgd_t *boot_hyp_pgd;
  37 static pgd_t *hyp_pgd;
  38 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
  39
  40 static void *init_bounce_page;
  41 static unsigned long hyp_idmap_start;
  42 static unsigned long hyp_idmap_end;
  43 static phys_addr_t hyp_idmap_vector;
  44
  45 #define pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
  46
  47 #define kvm_pmd_huge(_x)        (pmd_huge(_x) || pmd_trans_huge(_x))
  48
  49 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
  50 {
  51         /*
  52          * This function also gets called when dealing with HYP page
  53          * tables. As HYP doesn't have an associated struct kvm (and
  54          * the HYP page tables are fairly static), we don't do
  55          * anything there.
  56          */
  57         if (kvm)
  58                 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
  59 }
  60
  61 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
  62                                   int min, int max)
  63 {
  64         void *page;
  65
  66         BUG_ON(max > KVM_NR_MEM_OBJS);
  67         if (cache->nobjs >= min)
  68                 return 0;
  69         while (cache->nobjs < max) {
  70                 page = (void *)__get_free_page(PGALLOC_GFP);
  71                 if (!page)
  72                         return -ENOMEM;
  73                 cache->objects[cache->nobjs++] = page;
  74         }
  75         return 0;
  76 }
  77
  78 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
  79 {
  80         while (mc->nobjs)
  81                 free_page((unsigned long)mc->objects[--mc->nobjs]);
  82 }
  83
  84 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
  85 {
  86         void *p;
  87
  88         BUG_ON(!mc || !mc->nobjs);
  89         p = mc->objects[--mc->nobjs];
  90         return p;
  91 }
  92
  93 static bool page_empty(void *ptr)
  94 {
  95         struct page *ptr_page = virt_to_page(ptr);
  96         return page_count(ptr_page) == 1;
  97 }
  98
  99 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 100 {
 101         if (pud_huge(*pud)) {
 102                 pud_clear(pud);
 103                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 104         } else {
 105                 pmd_t *pmd_table = pmd_offset(pud, 0);
 106                 pud_clear(pud);
 107                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 108                 pmd_free(NULL, pmd_table);
 109         }
 110         put_page(virt_to_page(pud));
 111 }
 112
 113 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 114 {
 115         if (kvm_pmd_huge(*pmd)) {
 116                 pmd_clear(pmd);
 117                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 118         } else {
 119                 pte_t *pte_table = pte_offset_kernel(pmd, 0);
 120                 pmd_clear(pmd);
 121                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 122                 pte_free_kernel(NULL, pte_table);
 123         }
 124         put_page(virt_to_page(pmd));
 125 }
 126
 127 static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr)
 128 {
 129         if (pte_present(*pte)) {
 130                 kvm_set_pte(pte, __pte(0));
 131                 put_page(virt_to_page(pte));
 132                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 133         }
 134 }
 135
 136 static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
 137                         unsigned long long start, u64 size)
 138 {
 139         pgd_t *pgd;
 140         pud_t *pud;
 141         pmd_t *pmd;
 142         pte_t *pte;
 143         unsigned long long addr = start, end = start + size;
 144         u64 next;
 145
 146         while (addr < end) {
 147                 pgd = pgdp + pgd_index(addr);
 148                 pud = pud_offset(pgd, addr);
 149                 if (pud_none(*pud)) {
 150                         addr = pud_addr_end(addr, end);
 151                         continue;
 152                 }
 153
 154                 if (pud_huge(*pud)) {
 155                         /*
 156                          * If we are dealing with a huge pud, just clear it and
 157                          * move on.
 158                          */
 159                         clear_pud_entry(kvm, pud, addr);
 160                         addr = pud_addr_end(addr, end);
 161                         continue;
 162                 }
 163
 164                 pmd = pmd_offset(pud, addr);
 165                 if (pmd_none(*pmd)) {
 166                         addr = pmd_addr_end(addr, end);
 167                         continue;
 168                 }
 169
 170                 if (!kvm_pmd_huge(*pmd)) {
 171                         pte = pte_offset_kernel(pmd, addr);
 172                         clear_pte_entry(kvm, pte, addr);
 173                         next = addr + PAGE_SIZE;
 174                 }
 175
 176                 /*
 177                  * If the pmd entry is to be cleared, walk back up the ladder
 178                  */
 179                 if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
 180                         clear_pmd_entry(kvm, pmd, addr);
 181                         next = pmd_addr_end(addr, end);
 182                         if (page_empty(pmd) && !page_empty(pud)) {
 183                                 clear_pud_entry(kvm, pud, addr);
 184                                 next = pud_addr_end(addr, end);
 185                         }
 186                 }
 187
 188                 addr = next;
 189         }
 190 }
 191
 192 /**
 193  * free_boot_hyp_pgd - free HYP boot page tables
 194  *
 195  * Free the HYP boot page tables. The bounce page is also freed.
 196  */
 197 void free_boot_hyp_pgd(void)
 198 {
 199         mutex_lock(&kvm_hyp_pgd_mutex);
 200
 201         if (boot_hyp_pgd) {
 202                 unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
 203                 unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 204                 free_pages((unsigned long)boot_hyp_pgd, pgd_order);
 205                 boot_hyp_pgd = NULL;
 206         }
 207
 208         if (hyp_pgd)
 209                 unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 210
 211         free_page((unsigned long)init_bounce_page);
 212         init_bounce_page = NULL;
 213
 214         mutex_unlock(&kvm_hyp_pgd_mutex);
 215 }
 216
 217 /**
 218  * free_hyp_pgds - free Hyp-mode page tables
 219  *
 220  * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 221  * therefore contains either mappings in the kernel memory area (above
 222  * PAGE_OFFSET), or device mappings in the vmalloc range (from
 223  * VMALLOC_START to VMALLOC_END).
 224  *
 225  * boot_hyp_pgd should only map two pages for the init code.
 226  */
 227 void free_hyp_pgds(void)
 228 {
 229         unsigned long addr;
 230
 231         free_boot_hyp_pgd();
 232
 233         mutex_lock(&kvm_hyp_pgd_mutex);
 234
 235         if (hyp_pgd) {
 236                 for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
 237                         unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 238                 for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
 239                         unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 240
 241                 free_pages((unsigned long)hyp_pgd, pgd_order);
 242                 hyp_pgd = NULL;
 243         }
 244
 245         mutex_unlock(&kvm_hyp_pgd_mutex);
 246 }
 247
 248 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
 249                                     unsigned long end, unsigned long pfn,
 250                                     pgprot_t prot)
 251 {
 252         pte_t *pte;
 253         unsigned long addr;
 254
 255         addr = start;
 256         do {
 257                 pte = pte_offset_kernel(pmd, addr);
 258                 kvm_set_pte(pte, pfn_pte(pfn, prot));
 259                 get_page(virt_to_page(pte));
 260                 kvm_flush_dcache_to_poc(pte, sizeof(*pte));
 261                 pfn++;
 262         } while (addr += PAGE_SIZE, addr != end);
 263 }
 264
 265 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
 266                                    unsigned long end, unsigned long pfn,
 267                                    pgprot_t prot)
 268 {
 269         pmd_t *pmd;
 270         pte_t *pte;
 271         unsigned long addr, next;
 272
 273         addr = start;
 274         do {
 275                 pmd = pmd_offset(pud, addr);
 276
 277                 BUG_ON(pmd_sect(*pmd));
 278
 279                 if (pmd_none(*pmd)) {
 280                         pte = pte_alloc_one_kernel(NULL, addr);
 281                         if (!pte) {
 282                                 kvm_err("Cannot allocate Hyp pte\n");
 283                                 return -ENOMEM;
 284                         }
 285                         pmd_populate_kernel(NULL, pmd, pte);
 286                         get_page(virt_to_page(pmd));
 287                         kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
 288                 }
 289
 290                 next = pmd_addr_end(addr, end);
 291
 292                 create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
 293                 pfn += (next - addr) >> PAGE_SHIFT;
 294         } while (addr = next, addr != end);
 295
 296         return 0;
 297 }
 298
 299 static int __create_hyp_mappings(pgd_t *pgdp,
 300                                  unsigned long start, unsigned long end,
 301                                  unsigned long pfn, pgprot_t prot)
 302 {
 303         pgd_t *pgd;
 304         pud_t *pud;
 305         pmd_t *pmd;
 306         unsigned long addr, next;
 307         int err = 0;
 308
 309         mutex_lock(&kvm_hyp_pgd_mutex);
 310         addr = start & PAGE_MASK;
 311         end = PAGE_ALIGN(end);
 312         do {
 313                 pgd = pgdp + pgd_index(addr);
 314                 pud = pud_offset(pgd, addr);
 315
 316                 if (pud_none_or_clear_bad(pud)) {
 317                         pmd = pmd_alloc_one(NULL, addr);
 318                         if (!pmd) {
 319                                 kvm_err("Cannot allocate Hyp pmd\n");
 320                                 err = -ENOMEM;
 321                                 goto out;
 322                         }
 323                         pud_populate(NULL, pud, pmd);
 324                         get_page(virt_to_page(pud));
 325                         kvm_flush_dcache_to_poc(pud, sizeof(*pud));
 326                 }
 327
 328                 next = pgd_addr_end(addr, end);
 329                 err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
 330                 if (err)
 331                         goto out;
 332                 pfn += (next - addr) >> PAGE_SHIFT;
 333         } while (addr = next, addr != end);
 334 out:
 335         mutex_unlock(&kvm_hyp_pgd_mutex);
 336         return err;
 337 }
 338
 339 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
 340 {
 341         if (!is_vmalloc_addr(kaddr)) {
 342                 BUG_ON(!virt_addr_valid(kaddr));
 343                 return __pa(kaddr);
 344         } else {
 345                 return page_to_phys(vmalloc_to_page(kaddr)) +
 346                        offset_in_page(kaddr);
 347         }
 348 }
 349
 350 /**
 351  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 352  * @from:       The virtual kernel start address of the range
 353  * @to:         The virtual kernel end address of the range (exclusive)
 354  *
 355  * The same virtual address as the kernel virtual address is also used
 356  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 357  * physical pages.
 358  */
 359 int create_hyp_mappings(void *from, void *to)
 360 {
 361         phys_addr_t phys_addr;
 362         unsigned long virt_addr;
 363         unsigned long start = KERN_TO_HYP((unsigned long)from);
 364         unsigned long end = KERN_TO_HYP((unsigned long)to);
 365
 366         start = start & PAGE_MASK;
 367         end = PAGE_ALIGN(end);
 368
 369         for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
 370                 int err;
 371
 372                 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
 373                 err = __create_hyp_mappings(hyp_pgd, virt_addr,
 374                                             virt_addr + PAGE_SIZE,
 375                                             __phys_to_pfn(phys_addr),
 376                                             PAGE_HYP);
 377                 if (err)
 378                         return err;
 379         }
 380
 381         return 0;
 382 }
 383
 384 /**
 385  * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 386  * @from:       The kernel start VA of the range
 387  * @to:         The kernel end VA of the range (exclusive)
 388  * @phys_addr:  The physical start address which gets mapped
 389  *
 390  * The resulting HYP VA is the same as the kernel VA, modulo
 391  * HYP_PAGE_OFFSET.
 392  */
 393 int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 394 {
 395         unsigned long start = KERN_TO_HYP((unsigned long)from);
 396         unsigned long end = KERN_TO_HYP((unsigned long)to);
 397
 398         /* Check for a valid kernel IO mapping */
 399         if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
 400                 return -EINVAL;
 401
 402         return __create_hyp_mappings(hyp_pgd, start, end,
 403                                      __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 404 }
 405
 406 /**
 407  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 408  * @kvm:        The KVM struct pointer for the VM.
 409  *
 410  * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
 411  * support either full 40-bit input addresses or limited to 32-bit input
 412  * addresses). Clears the allocated pages.
 413  *
 414  * Note we don't need locking here as this is only called when the VM is
 415  * created, which can only be done once.
 416  */
 417 int kvm_alloc_stage2_pgd(struct kvm *kvm)
 418 {
 419         pgd_t *pgd;
 420
 421         if (kvm->arch.pgd != NULL) {
 422                 kvm_err("kvm_arch already initialized?\n");
 423                 return -EINVAL;
 424         }
 425
 426         pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
 427         if (!pgd)
 428                 return -ENOMEM;
 429
 430         memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
 431         kvm_clean_pgd(pgd);
 432         kvm->arch.pgd = pgd;
 433
 434         return 0;
 435 }
 436
 437 /**
 438  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 439  * @kvm:   The VM pointer
 440  * @start: The intermediate physical base address of the range to unmap
 441  * @size:  The size of the area to unmap
 442  *
 443  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 444  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 445  * destroying the VM), otherwise another faulting VCPU may come in and mess
 446  * with things behind our backs.
 447  */
 448 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 449 {
 450         unmap_range(kvm, kvm->arch.pgd, start, size);
 451 }
 452
 453 /**
 454  * kvm_free_stage2_pgd - free all stage-2 tables
 455  * @kvm:        The KVM struct pointer for the VM.
 456  *
 457  * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 458  * underlying level-2 and level-3 tables before freeing the actual level-1 table
 459  * and setting the struct pointer to NULL.
 460  *
 461  * Note we don't need locking here as this is only called when the VM is
 462  * destroyed, which can only be done once.
 463  */
 464 void kvm_free_stage2_pgd(struct kvm *kvm)
 465 {
 466         if (kvm->arch.pgd == NULL)
 467                 return;
 468
 469         unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
 470         free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER);
 471         kvm->arch.pgd = NULL;
 472 }
 473
 474 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 475                              phys_addr_t addr)
 476 {
 477         pgd_t *pgd;
 478         pud_t *pud;
 479         pmd_t *pmd;
 480
 481         pgd = kvm->arch.pgd + pgd_index(addr);
 482         pud = pud_offset(pgd, addr);
 483         if (pud_none(*pud)) {
 484                 if (!cache)
 485                         return NULL;
 486                 pmd = mmu_memory_cache_alloc(cache);
 487                 pud_populate(NULL, pud, pmd);
 488                 get_page(virt_to_page(pud));
 489         }
 490
 491         return pmd_offset(pud, addr);
 492 }
 493
 494 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 495                                *cache, phys_addr_t addr, const pmd_t *new_pmd)
 496 {
 497         pmd_t *pmd, old_pmd;
 498
 499         pmd = stage2_get_pmd(kvm, cache, addr);
 500         VM_BUG_ON(!pmd);
 501
 502         /*
 503          * Mapping in huge pages should only happen through a fault.  If a
 504          * page is merged into a transparent huge page, the individual
 505          * subpages of that huge page should be unmapped through MMU
 506          * notifiers before we get here.
 507          *
 508          * Merging of CompoundPages is not supported; they should become
 509          * splitting first, unmapped, merged, and mapped back in on-demand.
 510          */
 511         VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
 512
 513         old_pmd = *pmd;
 514         kvm_set_pmd(pmd, *new_pmd);
 515         if (pmd_present(old_pmd))
 516                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 517         else
 518                 get_page(virt_to_page(pmd));
 519         return 0;
 520 }
 521
 522 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 523                           phys_addr_t addr, const pte_t *new_pte, bool iomap)
 524 {
 525         pmd_t *pmd;
 526         pte_t *pte, old_pte;
 527
 528         /* Create stage-2 page table mapping - Level 1 */
 529         pmd = stage2_get_pmd(kvm, cache, addr);
 530         if (!pmd) {
 531                 /*
 532                  * Ignore calls from kvm_set_spte_hva for unallocated
 533                  * address ranges.
 534                  */
 535                 return 0;
 536         }
 537
 538         /* Create stage-2 page mappings - Level 2 */
 539         if (pmd_none(*pmd)) {
 540                 if (!cache)
 541                         return 0; /* ignore calls from kvm_set_spte_hva */
 542                 pte = mmu_memory_cache_alloc(cache);
 543                 kvm_clean_pte(pte);
 544                 pmd_populate_kernel(NULL, pmd, pte);
 545                 get_page(virt_to_page(pmd));
 546         }
 547
 548         pte = pte_offset_kernel(pmd, addr);
 549
 550         if (iomap && pte_present(*pte))
 551                 return -EFAULT;
 552
 553         /* Create 2nd stage page table mapping - Level 3 */
 554         old_pte = *pte;
 555         kvm_set_pte(pte, *new_pte);
 556         if (pte_present(old_pte))
 557                 kvm_tlb_flush_vmid_ipa(kvm, addr);
 558         else
 559                 get_page(virt_to_page(pte));
 560
 561         return 0;
 562 }
 563
 564 /**
 565  * kvm_phys_addr_ioremap - map a device range to guest IPA
 566  *
 567  * @kvm:        The KVM pointer
 568  * @guest_ipa:  The IPA at which to insert the mapping
 569  * @pa:         The physical address of the device
 570  * @size:       The size of the mapping
 571  */
 572 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 573                           phys_addr_t pa, unsigned long size)
 574 {
 575         phys_addr_t addr, end;
 576         int ret = 0;
 577         unsigned long pfn;
 578         struct kvm_mmu_memory_cache cache = { 0, };
 579
 580         end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
 581         pfn = __phys_to_pfn(pa);
 582
 583         for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
 584                 pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
 585
 586                 ret = mmu_topup_memory_cache(&cache, 2, 2);
 587                 if (ret)
 588                         goto out;
 589                 spin_lock(&kvm->mmu_lock);
 590                 ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
 591                 spin_unlock(&kvm->mmu_lock);
 592                 if (ret)
 593                         goto out;
 594
 595                 pfn++;
 596         }
 597
 598 out:
 599         mmu_free_memory_cache(&cache);
 600         return ret;
 601 }
 602
 603 static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
 604 {
 605         pfn_t pfn = *pfnp;
 606         gfn_t gfn = *ipap >> PAGE_SHIFT;
 607
 608         if (PageTransCompound(pfn_to_page(pfn))) {
 609                 unsigned long mask;
 610                 /*
 611                  * The address we faulted on is backed by a transparent huge
 612                  * page.  However, because we map the compound huge page and
 613                  * not the individual tail page, we need to transfer the
 614                  * refcount to the head page.  We have to be careful that the
 615                  * THP doesn't start to split while we are adjusting the
 616                  * refcounts.
 617                  *
 618                  * We are sure this doesn't happen, because mmu_notifier_retry
 619                  * was successful and we are holding the mmu_lock, so if this
 620                  * THP is trying to split, it will be blocked in the mmu
 621                  * notifier before touching any of the pages, specifically
 622                  * before being able to call __split_huge_page_refcount().
 623                  *
 624                  * We can therefore safely transfer the refcount from PG_tail
 625                  * to PG_head and switch the pfn from a tail page to the head
 626                  * page accordingly.
 627                  */
 628                 mask = PTRS_PER_PMD - 1;
 629                 VM_BUG_ON((gfn & mask) != (pfn & mask));
 630                 if (pfn & mask) {
 631                         *ipap &= PMD_MASK;
 632                         kvm_release_pfn_clean(pfn);
 633                         pfn &= ~mask;
 634                         kvm_get_pfn(pfn);
 635                         *pfnp = pfn;
 636                 }
 637
 638                 return true;
 639         }
 640
 641         return false;
 642 }
 643
 644 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 645                           struct kvm_memory_slot *memslot,
 646                           unsigned long fault_status)
 647 {
 648         int ret;
 649         bool write_fault, writable, hugetlb = false, force_pte = false;
 650         unsigned long mmu_seq;
 651         gfn_t gfn = fault_ipa >> PAGE_SHIFT;
 652         unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
 653         struct kvm *kvm = vcpu->kvm;
 654         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 655         struct vm_area_struct *vma;
 656         pfn_t pfn;
 657
 658         write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
 659         if (fault_status == FSC_PERM && !write_fault) {
 660                 kvm_err("Unexpected L2 read permission error\n");
 661                 return -EFAULT;
 662         }
 663
 664         /* Let's check if we will get back a huge page backed by hugetlbfs */
 665         down_read(&current->mm->mmap_sem);
 666         vma = find_vma_intersection(current->mm, hva, hva + 1);
 667         if (is_vm_hugetlb_page(vma)) {
 668                 hugetlb = true;
 669                 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 670         } else {
 671                 /*
 672                  * Pages belonging to memslots that don't have the same
 673                  * alignment for userspace and IPA cannot be mapped using
 674                  * block descriptors even if the pages belong to a THP for
 675                  * the process, because the stage-2 block descriptor will
 676                  * cover more than a single THP and we loose atomicity for
 677                  * unmapping, updates, and splits of the THP or other pages
 678                  * in the stage-2 block range.
 679                  */
 680                 if ((memslot->userspace_addr & ~PMD_MASK) !=
 681                     ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
 682                         force_pte = true;
 683         }
 684         up_read(&current->mm->mmap_sem);
 685
 686         /* We need minimum second+third level pages */
 687         ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
 688         if (ret)
 689                 return ret;
 690
 691         mmu_seq = vcpu->kvm->mmu_notifier_seq;
 692         /*
 693          * Ensure the read of mmu_notifier_seq happens before we call
 694          * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
 695          * the page we just got a reference to gets unmapped before we have a
 696          * chance to grab the mmu_lock, which ensure that if the page gets
 697          * unmapped afterwards, the call to kvm_unmap_hva will take it away
 698          * from us again properly. This smp_rmb() interacts with the smp_wmb()
 699          * in kvm_mmu_notifier_invalidate_<page|range_end>.
 700          */
 701         smp_rmb();
 702
 703         pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
 704         if (is_error_pfn(pfn))
 705                 return -EFAULT;
 706
 707         spin_lock(&kvm->mmu_lock);
 708         if (mmu_notifier_retry(kvm, mmu_seq))
 709                 goto out_unlock;
 710         if (!hugetlb && !force_pte)
 711                 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 712
 713         if (hugetlb) {
 714                 pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
 715                 new_pmd = pmd_mkhuge(new_pmd);
 716                 if (writable) {
 717                         kvm_set_s2pmd_writable(&new_pmd);
 718                         kvm_set_pfn_dirty(pfn);
 719                 }
 720                 coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
 721                 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 722         } else {
 723                 pte_t new_pte = pfn_pte(pfn, PAGE_S2);
 724                 if (writable) {
 725                         kvm_set_s2pte_writable(&new_pte);
 726                         kvm_set_pfn_dirty(pfn);
 727                 }
 728                 coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
 729                 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
 730         }
 731
 732
 733 out_unlock:
 734         spin_unlock(&kvm->mmu_lock);
 735         kvm_release_pfn_clean(pfn);
 736         return ret;
 737 }
 738
 739 /**
 740  * kvm_handle_guest_abort - handles all 2nd stage aborts
 741  * @vcpu:       the VCPU pointer
 742  * @run:        the kvm_run structure
 743  *
 744  * Any abort that gets to the host is almost guaranteed to be caused by a
 745  * missing second stage translation table entry, which can mean that either the
 746  * guest simply needs more memory and we must allocate an appropriate page or it
 747  * can mean that the guest tried to access I/O memory, which is emulated by user
 748  * space. The distinction is based on the IPA causing the fault and whether this
 749  * memory region has been registered as standard RAM by user space.
 750  */
 751 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 752 {
 753         unsigned long fault_status;
 754         phys_addr_t fault_ipa;
 755         struct kvm_memory_slot *memslot;
 756         bool is_iabt;
 757         gfn_t gfn;
 758         int ret, idx;
 759
 760         is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
 761         fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
 762
 763         trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
 764                               kvm_vcpu_get_hfar(vcpu), fault_ipa);
 765
 766         /* Check the stage-2 fault is trans. fault or write fault */
 767         fault_status = kvm_vcpu_trap_get_fault(vcpu);
 768         if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
 769                 kvm_err("Unsupported fault status: EC=%#x DFCS=%#lx\n",
 770                         kvm_vcpu_trap_get_class(vcpu), fault_status);
 771                 return -EFAULT;
 772         }
 773
 774         idx = srcu_read_lock(&vcpu->kvm->srcu);
 775
 776         gfn = fault_ipa >> PAGE_SHIFT;
 777         if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
 778                 if (is_iabt) {
 779                         /* Prefetch Abort on I/O address */
 780                         kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
 781                         ret = 1;
 782                         goto out_unlock;
 783                 }
 784
 785                 if (fault_status != FSC_FAULT) {
 786                         kvm_err("Unsupported fault status on io memory: %#lx\n",
 787                                 fault_status);
 788                         ret = -EFAULT;
 789                         goto out_unlock;
 790                 }
 791
 792                 /*
 793                  * The IPA is reported as [MAX:12], so we need to
 794                  * complement it with the bottom 12 bits from the
 795                  * faulting VA. This is always 12 bits, irrespective
 796                  * of the page size.
 797                  */
 798                 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
 799                 ret = io_mem_abort(vcpu, run, fault_ipa);
 800                 goto out_unlock;
 801         }
 802
 803         memslot = gfn_to_memslot(vcpu->kvm, gfn);
 804
 805         ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
 806         if (ret == 0)
 807                 ret = 1;
 808 out_unlock:
 809         srcu_read_unlock(&vcpu->kvm->srcu, idx);
 810         return ret;
 811 }
 812
 813 static void handle_hva_to_gpa(struct kvm *kvm,
 814                               unsigned long start,
 815                               unsigned long end,
 816                               void (*handler)(struct kvm *kvm,
 817                                               gpa_t gpa, void *data),
 818                               void *data)
 819 {
 820         struct kvm_memslots *slots;
 821         struct kvm_memory_slot *memslot;
 822
 823         slots = kvm_memslots(kvm);
 824
 825         /* we only care about the pages that the guest sees */
 826         kvm_for_each_memslot(memslot, slots) {
 827                 unsigned long hva_start, hva_end;
 828                 gfn_t gfn, gfn_end;
 829
 830                 hva_start = max(start, memslot->userspace_addr);
 831                 hva_end = min(end, memslot->userspace_addr +
 832                                         (memslot->npages << PAGE_SHIFT));
 833                 if (hva_start >= hva_end)
 834                         continue;
 835
 836                 /*
 837                  * {gfn(page) | page intersects with [hva_start, hva_end)} =
 838                  * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 839                  */
 840                 gfn = hva_to_gfn_memslot(hva_start, memslot);
 841                 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 842
 843                 for (; gfn < gfn_end; ++gfn) {
 844                         gpa_t gpa = gfn << PAGE_SHIFT;
 845                         handler(kvm, gpa, data);
 846                 }
 847         }
 848 }
 849
 850 static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 851 {
 852         unmap_stage2_range(kvm, gpa, PAGE_SIZE);
 853 }
 854
 855 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 856 {
 857         unsigned long end = hva + PAGE_SIZE;
 858
 859         if (!kvm->arch.pgd)
 860                 return 0;
 861
 862         trace_kvm_unmap_hva(hva);
 863         handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
 864         return 0;
 865 }
 866
 867 int kvm_unmap_hva_range(struct kvm *kvm,
 868                         unsigned long start, unsigned long end)
 869 {
 870         if (!kvm->arch.pgd)
 871                 return 0;
 872
 873         trace_kvm_unmap_hva_range(start, end);
 874         handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
 875         return 0;
 876 }
 877
 878 static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 879 {
 880         pte_t *pte = (pte_t *)data;
 881
 882         stage2_set_pte(kvm, NULL, gpa, pte, false);
 883 }
 884
 885
 886 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 887 {
 888         unsigned long end = hva + PAGE_SIZE;
 889         pte_t stage2_pte;
 890
 891         if (!kvm->arch.pgd)
 892                 return;
 893
 894         trace_kvm_set_spte_hva(hva);
 895         stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
 896         handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
 897 }
 898
 899 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 900 {
 901         mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
 902 }
 903
 904 phys_addr_t kvm_mmu_get_httbr(void)
 905 {
 906         return virt_to_phys(hyp_pgd);
 907 }
 908
 909 phys_addr_t kvm_mmu_get_boot_httbr(void)
 910 {
 911         return virt_to_phys(boot_hyp_pgd);
 912 }
 913
 914 phys_addr_t kvm_get_idmap_vector(void)
 915 {
 916         return hyp_idmap_vector;
 917 }
 918
 919 int kvm_mmu_init(void)
 920 {
 921         int err;
 922
 923         hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
 924         hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
 925         hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
 926
 927         if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
 928                 /*
 929                  * Our init code is crossing a page boundary. Allocate
 930                  * a bounce page, copy the code over and use that.
 931                  */
 932                 size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
 933                 phys_addr_t phys_base;
 934
 935                 init_bounce_page = (void *)__get_free_page(GFP_KERNEL);
 936                 if (!init_bounce_page) {
 937                         kvm_err("Couldn't allocate HYP init bounce page\n");
 938                         err = -ENOMEM;
 939                         goto out;
 940                 }
 941
 942                 memcpy(init_bounce_page, __hyp_idmap_text_start, len);
 943                 /*
 944                  * Warning: the code we just copied to the bounce page
 945                  * must be flushed to the point of coherency.
 946                  * Otherwise, the data may be sitting in L2, and HYP
 947                  * mode won't be able to observe it as it runs with
 948                  * caches off at that point.
 949                  */
 950                 kvm_flush_dcache_to_poc(init_bounce_page, len);
 951
 952                 phys_base = kvm_virt_to_phys(init_bounce_page);
 953                 hyp_idmap_vector += phys_base - hyp_idmap_start;
 954                 hyp_idmap_start = phys_base;
 955                 hyp_idmap_end = phys_base + len;
 956
 957                 kvm_info("Using HYP init bounce page @%lx\n",
 958                          (unsigned long)phys_base);
 959         }
 960
 961         hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, pgd_order);
 962         boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, pgd_order);
 963
 964         if (!hyp_pgd || !boot_hyp_pgd) {
 965                 kvm_err("Hyp mode PGD not allocated\n");
 966                 err = -ENOMEM;
 967                 goto out;
 968         }
 969
 970         /* Create the idmap in the boot page tables */
 971         err =   __create_hyp_mappings(boot_hyp_pgd,
 972                                       hyp_idmap_start, hyp_idmap_end,
 973                                       __phys_to_pfn(hyp_idmap_start),
 974                                       PAGE_HYP);
 975
 976         if (err) {
 977                 kvm_err("Failed to idmap %lx-%lx\n",
 978                         hyp_idmap_start, hyp_idmap_end);
 979                 goto out;
 980         }
 981
 982         /* Map the very same page at the trampoline VA */
 983         err =   __create_hyp_mappings(boot_hyp_pgd,
 984                                       TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
 985                                       __phys_to_pfn(hyp_idmap_start),
 986                                       PAGE_HYP);
 987         if (err) {
 988                 kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
 989                         TRAMPOLINE_VA);
 990                 goto out;
 991         }
 992
 993         /* Map the same page again into the runtime page tables */
 994         err =   __create_hyp_mappings(hyp_pgd,
 995                                       TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
 996                                       __phys_to_pfn(hyp_idmap_start),
 997                                       PAGE_HYP);
 998         if (err) {
 999                 kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
1000                         TRAMPOLINE_VA);
1001                 goto out;
1002         }
1003
1004         return 0;
1005 out:
1006         free_hyp_pgds();
1007         return err;
1008 }