KVM: arm64: Rework CPTR_EL2 programming for HVHE configuration
arch/arm64/kvm/hyp/pgtable.c (platform/kernel/linux-starfive.git)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
4  * No bombay mix was harmed in the writing of this file.
5  *
6  * Copyright (C) 2020 Google LLC
7  * Author: Will Deacon <will@kernel.org>
8  */
9
10 #include <linux/bitfield.h>
11 #include <asm/kvm_pgtable.h>
12 #include <asm/stage2_pgtable.h>
13
14
15 #define KVM_PTE_TYPE                    BIT(1)
16 #define KVM_PTE_TYPE_BLOCK              0
17 #define KVM_PTE_TYPE_PAGE               1
18 #define KVM_PTE_TYPE_TABLE              1
19
20 #define KVM_PTE_LEAF_ATTR_LO            GENMASK(11, 2)
21
22 #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
23 #define KVM_PTE_LEAF_ATTR_LO_S1_AP      GENMASK(7, 6)
24 #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO           \
25         ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
26 #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW           \
27         ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
28 #define KVM_PTE_LEAF_ATTR_LO_S1_SH      GENMASK(9, 8)
29 #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS   3
30 #define KVM_PTE_LEAF_ATTR_LO_S1_AF      BIT(10)
31
32 #define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
33 #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R  BIT(6)
34 #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W  BIT(7)
35 #define KVM_PTE_LEAF_ATTR_LO_S2_SH      GENMASK(9, 8)
36 #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS   3
37 #define KVM_PTE_LEAF_ATTR_LO_S2_AF      BIT(10)
38
39 #define KVM_PTE_LEAF_ATTR_HI            GENMASK(63, 51)
40
41 #define KVM_PTE_LEAF_ATTR_HI_SW         GENMASK(58, 55)
42
43 #define KVM_PTE_LEAF_ATTR_HI_S1_XN      BIT(54)
44
45 #define KVM_PTE_LEAF_ATTR_HI_S2_XN      BIT(54)
46
47 #define KVM_PTE_LEAF_ATTR_S2_PERMS      (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
48                                          KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
49                                          KVM_PTE_LEAF_ATTR_HI_S2_XN)
50
51 #define KVM_INVALID_PTE_OWNER_MASK      GENMASK(9, 2)
52 #define KVM_MAX_OWNER_ID                1
53
54 /*
55  * Used to indicate a pte for which a 'break-before-make' sequence is in
56  * progress.
57  */
58 #define KVM_INVALID_PTE_LOCKED          BIT(10)
59
60 struct kvm_pgtable_walk_data {
61         struct kvm_pgtable_walker       *walker;
62
63         const u64                       start;
64         u64                             addr;
65         const u64                       end;
66 };
67
68 static bool kvm_phys_is_valid(u64 phys)
69 {
70         return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
71 }
72
73 static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
74 {
75         u64 granule = kvm_granule_size(ctx->level);
76
77         if (!kvm_level_supports_block_mapping(ctx->level))
78                 return false;
79
80         if (granule > (ctx->end - ctx->addr))
81                 return false;
82
83         if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
84                 return false;
85
86         return IS_ALIGNED(ctx->addr, granule);
87 }
88
89 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
90 {
91         u64 shift = kvm_granule_shift(level);
92         u64 mask = BIT(PAGE_SHIFT - 3) - 1;
93
94         return (data->addr >> shift) & mask;
95 }
96
97 static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
98 {
99         u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
100         u64 mask = BIT(pgt->ia_bits) - 1;
101
102         return (addr & mask) >> shift;
103 }
104
105 static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
106 {
107         struct kvm_pgtable pgt = {
108                 .ia_bits        = ia_bits,
109                 .start_level    = start_level,
110         };
111
112         return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
113 }
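
/*
 * Worked example (illustrative, assuming 4KiB granules, i.e. PAGE_SHIFT == 12):
 * a 40-bit IPA space starts the stage-2 walk at level 1, so kvm_pgd_page_idx()
 * shifts by kvm_granule_shift(0) == 39. kvm_pgd_page_idx(pgt, -1ULL) is then
 * ((BIT(40) - 1) >> 39) == 1, and kvm_pgd_pages() returns 2: the PGD is two
 * concatenated pages at the start level.
 */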
114
115 static bool kvm_pte_table(kvm_pte_t pte, u32 level)
116 {
117         if (level == KVM_PGTABLE_MAX_LEVELS - 1)
118                 return false;
119
120         if (!kvm_pte_valid(pte))
121                 return false;
122
123         return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
124 }
125
126 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
127 {
128         return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
129 }
130
131 static void kvm_clear_pte(kvm_pte_t *ptep)
132 {
133         WRITE_ONCE(*ptep, 0);
134 }
135
136 static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
137 {
138         kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
139
140         pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
141         pte |= KVM_PTE_VALID;
142         return pte;
143 }
144
145 static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
146 {
147         kvm_pte_t pte = kvm_phys_to_pte(pa);
148         u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
149                                                            KVM_PTE_TYPE_BLOCK;
150
151         pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
152         pte |= FIELD_PREP(KVM_PTE_TYPE, type);
153         pte |= KVM_PTE_VALID;
154
155         return pte;
156 }
157
158 static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
159 {
160         return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
161 }
162
163 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
164                                   const struct kvm_pgtable_visit_ctx *ctx,
165                                   enum kvm_pgtable_walk_flags visit)
166 {
167         struct kvm_pgtable_walker *walker = data->walker;
168
169         /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
170         WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
171         return walker->cb(ctx, visit);
172 }
173
174 static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
175                                       int r)
176 {
177         /*
178          * Visitor callbacks return EAGAIN when the conditions that led to a
179          * fault are no longer reflected in the page tables due to a race to
180          * update a PTE. In the context of a fault handler this is interpreted
181          * as a signal to retry guest execution.
182          *
183          * Ignore the return code altogether for walkers outside a fault handler
184          * (e.g. write protecting a range of memory) and chug along with the
185          * page table walk.
186          */
187         if (r == -EAGAIN)
188                 return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
189
190         return !r;
191 }
192
193 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
194                               struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
195
196 static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
197                                       struct kvm_pgtable_mm_ops *mm_ops,
198                                       kvm_pteref_t pteref, u32 level)
199 {
200         enum kvm_pgtable_walk_flags flags = data->walker->flags;
201         kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
202         struct kvm_pgtable_visit_ctx ctx = {
203                 .ptep   = ptep,
204                 .old    = READ_ONCE(*ptep),
205                 .arg    = data->walker->arg,
206                 .mm_ops = mm_ops,
207                 .start  = data->start,
208                 .addr   = data->addr,
209                 .end    = data->end,
210                 .level  = level,
211                 .flags  = flags,
212         };
213         int ret = 0;
214         kvm_pteref_t childp;
215         bool table = kvm_pte_table(ctx.old, level);
216
217         if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
218                 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
219
220         if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
221                 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
222                 ctx.old = READ_ONCE(*ptep);
223                 table = kvm_pte_table(ctx.old, level);
224         }
225
226         if (!kvm_pgtable_walk_continue(data->walker, ret))
227                 goto out;
228
229         if (!table) {
230                 data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
231                 data->addr += kvm_granule_size(level);
232                 goto out;
233         }
234
235         childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
236         ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
237         if (!kvm_pgtable_walk_continue(data->walker, ret))
238                 goto out;
239
240         if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
241                 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
242
243 out:
244         if (kvm_pgtable_walk_continue(data->walker, ret))
245                 return 0;
246
247         return ret;
248 }
249
250 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
251                               struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
252 {
253         u32 idx;
254         int ret = 0;
255
256         if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
257                 return -EINVAL;
258
259         for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
260                 kvm_pteref_t pteref = &pgtable[idx];
261
262                 if (data->addr >= data->end)
263                         break;
264
265                 ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
266                 if (ret)
267                         break;
268         }
269
270         return ret;
271 }
272
273 static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
274 {
275         u32 idx;
276         int ret = 0;
277         u64 limit = BIT(pgt->ia_bits);
278
279         if (data->addr > limit || data->end > limit)
280                 return -ERANGE;
281
282         if (!pgt->pgd)
283                 return -EINVAL;
284
285         for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
286                 kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
287
288                 ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
289                 if (ret)
290                         break;
291         }
292
293         return ret;
294 }
295
296 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
297                      struct kvm_pgtable_walker *walker)
298 {
299         struct kvm_pgtable_walk_data walk_data = {
300                 .start  = ALIGN_DOWN(addr, PAGE_SIZE),
301                 .addr   = ALIGN_DOWN(addr, PAGE_SIZE),
302                 .end    = PAGE_ALIGN(walk_data.addr + size),
303                 .walker = walker,
304         };
305         int r;
306
307         r = kvm_pgtable_walk_begin(walker);
308         if (r)
309                 return r;
310
311         r = _kvm_pgtable_walk(pgt, &walk_data);
312         kvm_pgtable_walk_end(walker);
313
314         return r;
315 }
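
/*
 * Illustrative sketch (hypothetical helpers, not taken from the kernel):
 * counting the valid leaf entries in a range with the generic walker above.
 * The walker struct, flags and visitor signature are the real ones from
 * <asm/kvm_pgtable.h>; count_leaf_walker() and count_valid_leaves() are made
 * up for the example.
 */
static int count_leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
                             enum kvm_pgtable_walk_flags visit)
{
        u64 *count = ctx->arg;

        if (kvm_pte_valid(ctx->old))
                (*count)++;

        return 0;
}

static u64 __maybe_unused count_valid_leaves(struct kvm_pgtable *pgt,
                                             u64 addr, u64 size)
{
        u64 count = 0;
        struct kvm_pgtable_walker walker = {
                .cb     = count_leaf_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF,
                .arg    = &count,
        };

        if (kvm_pgtable_walk(pgt, addr, size, &walker))
                return 0;

        return count;
}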
316
317 struct leaf_walk_data {
318         kvm_pte_t       pte;
319         u32             level;
320 };
321
322 static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
323                        enum kvm_pgtable_walk_flags visit)
324 {
325         struct leaf_walk_data *data = ctx->arg;
326
327         data->pte   = ctx->old;
328         data->level = ctx->level;
329
330         return 0;
331 }
332
333 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
334                          kvm_pte_t *ptep, u32 *level)
335 {
336         struct leaf_walk_data data;
337         struct kvm_pgtable_walker walker = {
338                 .cb     = leaf_walker,
339                 .flags  = KVM_PGTABLE_WALK_LEAF,
340                 .arg    = &data,
341         };
342         int ret;
343
344         ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
345                                PAGE_SIZE, &walker);
346         if (!ret) {
347                 if (ptep)
348                         *ptep  = data.pte;
349                 if (level)
350                         *level = data.level;
351         }
352
353         return ret;
354 }
355
356 struct hyp_map_data {
357         const u64                       phys;
358         kvm_pte_t                       attr;
359 };
360
361 static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
362 {
363         bool device = prot & KVM_PGTABLE_PROT_DEVICE;
364         u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
365         kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
366         u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
367         u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
368                                                KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
369
370         if (!(prot & KVM_PGTABLE_PROT_R))
371                 return -EINVAL;
372
373         if (prot & KVM_PGTABLE_PROT_X) {
374                 if (prot & KVM_PGTABLE_PROT_W)
375                         return -EINVAL;
376
377                 if (device)
378                         return -EINVAL;
379         } else {
380                 attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
381         }
382
383         attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
384         attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
385         attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
386         attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
387         *ptep = attr;
388
389         return 0;
390 }
391
392 enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
393 {
394         enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
395         u32 ap;
396
397         if (!kvm_pte_valid(pte))
398                 return prot;
399
400         if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
401                 prot |= KVM_PGTABLE_PROT_X;
402
403         ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
404         if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
405                 prot |= KVM_PGTABLE_PROT_R;
406         else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
407                 prot |= KVM_PGTABLE_PROT_RW;
408
409         return prot;
410 }
411
412 static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
413                                     struct hyp_map_data *data)
414 {
415         u64 phys = data->phys + (ctx->addr - ctx->start);
416         kvm_pte_t new;
417
418         if (!kvm_block_mapping_supported(ctx, phys))
419                 return false;
420
421         new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
422         if (ctx->old == new)
423                 return true;
424         if (!kvm_pte_valid(ctx->old))
425                 ctx->mm_ops->get_page(ctx->ptep);
426         else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
427                 return false;
428
429         smp_store_release(ctx->ptep, new);
430         return true;
431 }
432
433 static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
434                           enum kvm_pgtable_walk_flags visit)
435 {
436         kvm_pte_t *childp, new;
437         struct hyp_map_data *data = ctx->arg;
438         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
439
440         if (hyp_map_walker_try_leaf(ctx, data))
441                 return 0;
442
443         if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
444                 return -EINVAL;
445
446         childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
447         if (!childp)
448                 return -ENOMEM;
449
450         new = kvm_init_table_pte(childp, mm_ops);
451         mm_ops->get_page(ctx->ptep);
452         smp_store_release(ctx->ptep, new);
453
454         return 0;
455 }
456
457 int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
458                         enum kvm_pgtable_prot prot)
459 {
460         int ret;
461         struct hyp_map_data map_data = {
462                 .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
463         };
464         struct kvm_pgtable_walker walker = {
465                 .cb     = hyp_map_walker,
466                 .flags  = KVM_PGTABLE_WALK_LEAF,
467                 .arg    = &map_data,
468         };
469
470         ret = hyp_set_prot_attr(prot, &map_data.attr);
471         if (ret)
472                 return ret;
473
474         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
475         dsb(ishst);
476         isb();
477         return ret;
478 }
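
/*
 * Illustrative sketch (hypothetical caller): mapping a single page of normal,
 * read-only memory into the hyp stage-1 with the helper above. The pgtable,
 * VA and PA are assumed to come from the caller; the prot flag and the API
 * are the real ones.
 */
static int __maybe_unused example_hyp_map_ro_page(struct kvm_pgtable *hyp_pgt,
                                                  u64 va, u64 pa)
{
        return kvm_pgtable_hyp_map(hyp_pgt, va, PAGE_SIZE, pa,
                                   KVM_PGTABLE_PROT_R);
}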
479
480 static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
481                             enum kvm_pgtable_walk_flags visit)
482 {
483         kvm_pte_t *childp = NULL;
484         u64 granule = kvm_granule_size(ctx->level);
485         u64 *unmapped = ctx->arg;
486         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
487
488         if (!kvm_pte_valid(ctx->old))
489                 return -EINVAL;
490
491         if (kvm_pte_table(ctx->old, ctx->level)) {
492                 childp = kvm_pte_follow(ctx->old, mm_ops);
493
494                 if (mm_ops->page_count(childp) != 1)
495                         return 0;
496
497                 kvm_clear_pte(ctx->ptep);
498                 dsb(ishst);
499                 __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
500         } else {
501                 if (ctx->end - ctx->addr < granule)
502                         return -EINVAL;
503
504                 kvm_clear_pte(ctx->ptep);
505                 dsb(ishst);
506                 __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
507                 *unmapped += granule;
508         }
509
510         dsb(ish);
511         isb();
512         mm_ops->put_page(ctx->ptep);
513
514         if (childp)
515                 mm_ops->put_page(childp);
516
517         return 0;
518 }
519
520 u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
521 {
522         u64 unmapped = 0;
523         struct kvm_pgtable_walker walker = {
524                 .cb     = hyp_unmap_walker,
525                 .arg    = &unmapped,
526                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
527         };
528
529         if (!pgt->mm_ops->page_count)
530                 return 0;
531
532         kvm_pgtable_walk(pgt, addr, size, &walker);
533         return unmapped;
534 }
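
/*
 * Illustrative sketch (hypothetical caller): tearing down a page-sized hyp
 * mapping and checking that the full page was actually unmapped, since the
 * helper above returns the number of bytes it removed.
 */
static void __maybe_unused example_hyp_unmap_page(struct kvm_pgtable *hyp_pgt,
                                                  u64 va)
{
        WARN_ON(kvm_pgtable_hyp_unmap(hyp_pgt, va, PAGE_SIZE) != PAGE_SIZE);
}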
535
536 int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
537                          struct kvm_pgtable_mm_ops *mm_ops)
538 {
539         u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
540
541         pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
542         if (!pgt->pgd)
543                 return -ENOMEM;
544
545         pgt->ia_bits            = va_bits;
546         pgt->start_level        = KVM_PGTABLE_MAX_LEVELS - levels;
547         pgt->mm_ops             = mm_ops;
548         pgt->mmu                = NULL;
549         pgt->force_pte_cb       = NULL;
550
551         return 0;
552 }
553
554 static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
555                            enum kvm_pgtable_walk_flags visit)
556 {
557         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
558
559         if (!kvm_pte_valid(ctx->old))
560                 return 0;
561
562         mm_ops->put_page(ctx->ptep);
563
564         if (kvm_pte_table(ctx->old, ctx->level))
565                 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
566
567         return 0;
568 }
569
570 void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
571 {
572         struct kvm_pgtable_walker walker = {
573                 .cb     = hyp_free_walker,
574                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
575         };
576
577         WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
578         pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
579         pgt->pgd = NULL;
580 }
581
582 struct stage2_map_data {
583         const u64                       phys;
584         kvm_pte_t                       attr;
585         u8                              owner_id;
586
587         kvm_pte_t                       *anchor;
588         kvm_pte_t                       *childp;
589
590         struct kvm_s2_mmu               *mmu;
591         void                            *memcache;
592
593         /* Force mappings to page granularity */
594         bool                            force_pte;
595 };
596
597 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
598 {
599         u64 vtcr = VTCR_EL2_FLAGS;
600         u8 lvls;
601
602         vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
603         vtcr |= VTCR_EL2_T0SZ(phys_shift);
604         /*
605          * Use a minimum of 2 page-table levels to prevent splitting
606          * host PMD huge pages at stage2.
607          */
608         lvls = stage2_pgtable_levels(phys_shift);
609         if (lvls < 2)
610                 lvls = 2;
611         vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
612
613 #ifdef CONFIG_ARM64_HW_AFDBM
614         /*
615          * Enable the Hardware Access Flag management, unconditionally
616          * on all CPUs. The features is RES0 on CPUs without the support
617          * and must be ignored by the CPUs.
618          */
619         vtcr |= VTCR_EL2_HA;
620 #endif /* CONFIG_ARM64_HW_AFDBM */
621
622         /* Set the vmid bits */
623         vtcr |= (get_vmid_bits(mmfr1) == 16) ?
624                 VTCR_EL2_VS_16BIT :
625                 VTCR_EL2_VS_8BIT;
626
627         return vtcr;
628 }
629
630 static bool stage2_has_fwb(struct kvm_pgtable *pgt)
631 {
632         if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
633                 return false;
634
635         return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
636 }
637
638 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
639
640 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
641                                 kvm_pte_t *ptep)
642 {
643         bool device = prot & KVM_PGTABLE_PROT_DEVICE;
644         kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
645                             KVM_S2_MEMATTR(pgt, NORMAL);
646         u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
647
648         if (!(prot & KVM_PGTABLE_PROT_X))
649                 attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
650         else if (device)
651                 return -EINVAL;
652
653         if (prot & KVM_PGTABLE_PROT_R)
654                 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
655
656         if (prot & KVM_PGTABLE_PROT_W)
657                 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
658
659         attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
660         attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
661         attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
662         *ptep = attr;
663
664         return 0;
665 }
666
667 enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
668 {
669         enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
670
671         if (!kvm_pte_valid(pte))
672                 return prot;
673
674         if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
675                 prot |= KVM_PGTABLE_PROT_R;
676         if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
677                 prot |= KVM_PGTABLE_PROT_W;
678         if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
679                 prot |= KVM_PGTABLE_PROT_X;
680
681         return prot;
682 }
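
/*
 * Illustrative sketch (hypothetical helper): the decode helper above makes it
 * easy to ask permission questions about a stage-2 leaf without open-coding
 * the attribute bits.
 */
static bool __maybe_unused example_stage2_pte_is_writable(kvm_pte_t pte)
{
        return kvm_pte_valid(pte) &&
               (kvm_pgtable_stage2_pte_prot(pte) & KVM_PGTABLE_PROT_W);
}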
683
684 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
685 {
686         if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
687                 return true;
688
689         return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
690 }
691
692 static bool stage2_pte_is_counted(kvm_pte_t pte)
693 {
694         /*
695          * The refcount tracks valid entries as well as invalid entries that
696          * encode ownership of a page by an entity other than the page-table
697          * owner, whose id is 0.
698          */
699         return !!pte;
700 }
701
702 static bool stage2_pte_is_locked(kvm_pte_t pte)
703 {
704         return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
705 }
706
707 static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
708 {
709         if (!kvm_pgtable_walk_shared(ctx)) {
710                 WRITE_ONCE(*ctx->ptep, new);
711                 return true;
712         }
713
714         return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
715 }
716
717 /**
718  * stage2_try_break_pte() - Invalidates a pte according to the
719  *                          'break-before-make' requirements of the
720  *                          architecture.
721  *
722  * @ctx: context of the visited pte.
723  * @mmu: stage-2 mmu
724  *
725  * Returns: true if the pte was successfully broken.
726  *
727  * If the removed pte was valid, performs the necessary serialization and TLB
728  * invalidation for the old value. For counted ptes, drops the reference count
729  * on the containing table page.
730  */
731 static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
732                                  struct kvm_s2_mmu *mmu)
733 {
734         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
735
736         if (stage2_pte_is_locked(ctx->old)) {
737                 /*
738                  * Should never occur if this walker has exclusive access to the
739                  * page tables.
740                  */
741                 WARN_ON(!kvm_pgtable_walk_shared(ctx));
742                 return false;
743         }
744
745         if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
746                 return false;
747
748         /*
749          * Perform the appropriate TLB invalidation based on the evicted pte
750          * value (if any).
751          */
752         if (kvm_pte_table(ctx->old, ctx->level))
753                 kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
754         else if (kvm_pte_valid(ctx->old))
755                 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
756
757         if (stage2_pte_is_counted(ctx->old))
758                 mm_ops->put_page(ctx->ptep);
759
760         return true;
761 }
762
763 static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
764 {
765         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
766
767         WARN_ON(!stage2_pte_is_locked(*ctx->ptep));
768
769         if (stage2_pte_is_counted(new))
770                 mm_ops->get_page(ctx->ptep);
771
772         smp_store_release(ctx->ptep, new);
773 }
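
/*
 * Illustrative sketch of the break-before-make pairing used by the map walker
 * further down (assuming the caller has already built 'new' for the visited
 * entry): the old pte is first transitioned to KVM_INVALID_PTE_LOCKED, with
 * the required TLB invalidation, and only then is the replacement published
 * with stage2_make_pte().
 */
static int __maybe_unused example_stage2_replace_pte(const struct kvm_pgtable_visit_ctx *ctx,
                                                     struct kvm_s2_mmu *mmu,
                                                     kvm_pte_t new)
{
        if (!stage2_try_break_pte(ctx, mmu))
                return -EAGAIN;

        stage2_make_pte(ctx, new);
        return 0;
}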
774
775 static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
776                            struct kvm_pgtable_mm_ops *mm_ops)
777 {
778         /*
779          * Clear the existing PTE, and perform break-before-make with
780          * TLB maintenance if it was valid.
781          */
782         if (kvm_pte_valid(ctx->old)) {
783                 kvm_clear_pte(ctx->ptep);
784                 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
785         }
786
787         mm_ops->put_page(ctx->ptep);
788 }
789
790 static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
791 {
792         u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
793         return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
794 }
795
796 static bool stage2_pte_executable(kvm_pte_t pte)
797 {
798         return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
799 }
800
801 static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
802                                        const struct stage2_map_data *data)
803 {
804         u64 phys = data->phys;
805
806         /*
807          * Stage-2 walks to update ownership data are communicated to the map
808          * walker using an invalid PA. Avoid offsetting an already invalid PA,
809          * which could overflow and make the address valid again.
810          */
811         if (!kvm_phys_is_valid(phys))
812                 return phys;
813
814         /*
815          * Otherwise, work out the correct PA based on how far the walk has
816          * gotten.
817          */
818         return phys + (ctx->addr - ctx->start);
819 }
820
821 static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
822                                         struct stage2_map_data *data)
823 {
824         u64 phys = stage2_map_walker_phys_addr(ctx, data);
825
826         if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
827                 return false;
828
829         return kvm_block_mapping_supported(ctx, phys);
830 }
831
832 static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
833                                       struct stage2_map_data *data)
834 {
835         kvm_pte_t new;
836         u64 phys = stage2_map_walker_phys_addr(ctx, data);
837         u64 granule = kvm_granule_size(ctx->level);
838         struct kvm_pgtable *pgt = data->mmu->pgt;
839         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
840
841         if (!stage2_leaf_mapping_allowed(ctx, data))
842                 return -E2BIG;
843
844         if (kvm_phys_is_valid(phys))
845                 new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
846         else
847                 new = kvm_init_invalid_leaf_owner(data->owner_id);
848
849         /*
850          * Skip updating the PTE if we are trying to recreate the exact
851          * same mapping or only change the access permissions. Instead,
852          * the vCPU will exit from the guest one more time if still needed
853          * and then go through the path of relaxing permissions.
854          */
855         if (!stage2_pte_needs_update(ctx->old, new))
856                 return -EAGAIN;
857
858         if (!stage2_try_break_pte(ctx, data->mmu))
859                 return -EAGAIN;
860
861         /* Perform CMOs before installation of the guest stage-2 PTE */
862         if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
863                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
864                                                 granule);
865
866         if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
867                 mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
868
869         stage2_make_pte(ctx, new);
870
871         return 0;
872 }
873
874 static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
875                                      struct stage2_map_data *data)
876 {
877         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
878         kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
879         int ret;
880
881         if (!stage2_leaf_mapping_allowed(ctx, data))
882                 return 0;
883
884         ret = stage2_map_walker_try_leaf(ctx, data);
885         if (ret)
886                 return ret;
887
888         mm_ops->free_removed_table(childp, ctx->level);
889         return 0;
890 }
891
892 static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
893                                 struct stage2_map_data *data)
894 {
895         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
896         kvm_pte_t *childp, new;
897         int ret;
898
899         ret = stage2_map_walker_try_leaf(ctx, data);
900         if (ret != -E2BIG)
901                 return ret;
902
903         if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
904                 return -EINVAL;
905
906         if (!data->memcache)
907                 return -ENOMEM;
908
909         childp = mm_ops->zalloc_page(data->memcache);
910         if (!childp)
911                 return -ENOMEM;
912
913         if (!stage2_try_break_pte(ctx, data->mmu)) {
914                 mm_ops->put_page(childp);
915                 return -EAGAIN;
916         }
917
918         /*
919          * If we've run into an existing block mapping then replace it with
920          * a table. Accesses beyond 'end' that fall within the new table
921          * will be mapped lazily.
922          */
923         new = kvm_init_table_pte(childp, mm_ops);
924         stage2_make_pte(ctx, new);
925
926         return 0;
927 }
928
929 /*
930  * The TABLE_PRE callback runs for table entries on the way down, looking
931  * for table entries which we could conceivably replace with a block entry
932  * for this mapping. If it finds one it replaces the entry and calls
933  * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
934  *
935  * Otherwise, the LEAF callback performs the mapping at the existing leaves
936  * instead.
937  */
938 static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
939                              enum kvm_pgtable_walk_flags visit)
940 {
941         struct stage2_map_data *data = ctx->arg;
942
943         switch (visit) {
944         case KVM_PGTABLE_WALK_TABLE_PRE:
945                 return stage2_map_walk_table_pre(ctx, data);
946         case KVM_PGTABLE_WALK_LEAF:
947                 return stage2_map_walk_leaf(ctx, data);
948         default:
949                 return -EINVAL;
950         }
951 }
952
953 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
954                            u64 phys, enum kvm_pgtable_prot prot,
955                            void *mc, enum kvm_pgtable_walk_flags flags)
956 {
957         int ret;
958         struct stage2_map_data map_data = {
959                 .phys           = ALIGN_DOWN(phys, PAGE_SIZE),
960                 .mmu            = pgt->mmu,
961                 .memcache       = mc,
962                 .force_pte      = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
963         };
964         struct kvm_pgtable_walker walker = {
965                 .cb             = stage2_map_walker,
966                 .flags          = flags |
967                                   KVM_PGTABLE_WALK_TABLE_PRE |
968                                   KVM_PGTABLE_WALK_LEAF,
969                 .arg            = &map_data,
970         };
971
972         if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
973                 return -EINVAL;
974
975         ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
976         if (ret)
977                 return ret;
978
979         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
980         dsb(ishst);
981         return ret;
982 }
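
/*
 * Illustrative sketch (hypothetical caller): installing a writable, executable
 * 4KiB guest mapping, roughly as a stage-2 fault handler would. 'memcache' is
 * assumed to have been topped up beforehand so intermediate table pages can
 * be allocated; the walk flags shown are the ones a fault handler would
 * typically pass.
 */
static int __maybe_unused example_stage2_map_page(struct kvm_pgtable *pgt,
                                                  u64 ipa, u64 pa, void *memcache)
{
        return kvm_pgtable_stage2_map(pgt, ipa, PAGE_SIZE, pa,
                                      KVM_PGTABLE_PROT_RWX, memcache,
                                      KVM_PGTABLE_WALK_HANDLE_FAULT |
                                      KVM_PGTABLE_WALK_SHARED);
}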
983
984 int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
985                                  void *mc, u8 owner_id)
986 {
987         int ret;
988         struct stage2_map_data map_data = {
989                 .phys           = KVM_PHYS_INVALID,
990                 .mmu            = pgt->mmu,
991                 .memcache       = mc,
992                 .owner_id       = owner_id,
993                 .force_pte      = true,
994         };
995         struct kvm_pgtable_walker walker = {
996                 .cb             = stage2_map_walker,
997                 .flags          = KVM_PGTABLE_WALK_TABLE_PRE |
998                                   KVM_PGTABLE_WALK_LEAF,
999                 .arg            = &map_data,
1000         };
1001
1002         if (owner_id > KVM_MAX_OWNER_ID)
1003                 return -EINVAL;
1004
1005         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1006         return ret;
1007 }
1008
1009 static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
1010                                enum kvm_pgtable_walk_flags visit)
1011 {
1012         struct kvm_pgtable *pgt = ctx->arg;
1013         struct kvm_s2_mmu *mmu = pgt->mmu;
1014         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1015         kvm_pte_t *childp = NULL;
1016         bool need_flush = false;
1017
1018         if (!kvm_pte_valid(ctx->old)) {
1019                 if (stage2_pte_is_counted(ctx->old)) {
1020                         kvm_clear_pte(ctx->ptep);
1021                         mm_ops->put_page(ctx->ptep);
1022                 }
1023                 return 0;
1024         }
1025
1026         if (kvm_pte_table(ctx->old, ctx->level)) {
1027                 childp = kvm_pte_follow(ctx->old, mm_ops);
1028
1029                 if (mm_ops->page_count(childp) != 1)
1030                         return 0;
1031         } else if (stage2_pte_cacheable(pgt, ctx->old)) {
1032                 need_flush = !stage2_has_fwb(pgt);
1033         }
1034
1035         /*
1036          * This is similar to the map() path in that we unmap the entire
1037          * block entry and rely on the remaining portions being faulted
1038          * back lazily.
1039          */
1040         stage2_put_pte(ctx, mmu, mm_ops);
1041
1042         if (need_flush && mm_ops->dcache_clean_inval_poc)
1043                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1044                                                kvm_granule_size(ctx->level));
1045
1046         if (childp)
1047                 mm_ops->put_page(childp);
1048
1049         return 0;
1050 }
1051
1052 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
1053 {
1054         struct kvm_pgtable_walker walker = {
1055                 .cb     = stage2_unmap_walker,
1056                 .arg    = pgt,
1057                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
1058         };
1059
1060         return kvm_pgtable_walk(pgt, addr, size, &walker);
1061 }
1062
1063 struct stage2_attr_data {
1064         kvm_pte_t                       attr_set;
1065         kvm_pte_t                       attr_clr;
1066         kvm_pte_t                       pte;
1067         u32                             level;
1068 };
1069
1070 static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
1071                               enum kvm_pgtable_walk_flags visit)
1072 {
1073         kvm_pte_t pte = ctx->old;
1074         struct stage2_attr_data *data = ctx->arg;
1075         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1076
1077         if (!kvm_pte_valid(ctx->old))
1078                 return -EAGAIN;
1079
1080         data->level = ctx->level;
1081         data->pte = pte;
1082         pte &= ~data->attr_clr;
1083         pte |= data->attr_set;
1084
1085         /*
1086          * We may race with the CPU trying to set the access flag here,
1087          * but worst-case the access flag update gets lost and will be
1088          * set on the next access instead.
1089          */
1090         if (data->pte != pte) {
1091                 /*
1092                  * Invalidate instruction cache before updating the guest
1093                  * stage-2 PTE if we are going to add executable permission.
1094                  */
1095                 if (mm_ops->icache_inval_pou &&
1096                     stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
1097                         mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
1098                                                   kvm_granule_size(ctx->level));
1099
1100                 if (!stage2_try_set_pte(ctx, pte))
1101                         return -EAGAIN;
1102         }
1103
1104         return 0;
1105 }
1106
1107 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
1108                                     u64 size, kvm_pte_t attr_set,
1109                                     kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
1110                                     u32 *level, enum kvm_pgtable_walk_flags flags)
1111 {
1112         int ret;
1113         kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
1114         struct stage2_attr_data data = {
1115                 .attr_set       = attr_set & attr_mask,
1116                 .attr_clr       = attr_clr & attr_mask,
1117         };
1118         struct kvm_pgtable_walker walker = {
1119                 .cb             = stage2_attr_walker,
1120                 .arg            = &data,
1121                 .flags          = flags | KVM_PGTABLE_WALK_LEAF,
1122         };
1123
1124         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1125         if (ret)
1126                 return ret;
1127
1128         if (orig_pte)
1129                 *orig_pte = data.pte;
1130
1131         if (level)
1132                 *level = data.level;
1133         return 0;
1134 }
1135
1136 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
1137 {
1138         return stage2_update_leaf_attrs(pgt, addr, size, 0,
1139                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
1140                                         NULL, NULL, 0);
1141 }
1142
1143 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
1144 {
1145         kvm_pte_t pte = 0;
1146         int ret;
1147
1148         ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
1149                                        &pte, NULL,
1150                                        KVM_PGTABLE_WALK_HANDLE_FAULT |
1151                                        KVM_PGTABLE_WALK_SHARED);
1152         if (!ret)
1153                 dsb(ishst);
1154
1155         return pte;
1156 }
1157
1158 kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
1159 {
1160         kvm_pte_t pte = 0;
1161         stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
1162                                  &pte, NULL, 0);
1163         /*
1164          * "But where's the TLBI?!", you scream.
1165          * "Over in the core code", I sigh.
1166          *
1167          * See the '->clear_flush_young()' callback on the KVM mmu notifier.
1168          */
1169         return pte;
1170 }
1171
1172 bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
1173 {
1174         kvm_pte_t pte = 0;
1175         stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
1176         return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
1177 }
1178
1179 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
1180                                    enum kvm_pgtable_prot prot)
1181 {
1182         int ret;
1183         u32 level;
1184         kvm_pte_t set = 0, clr = 0;
1185
1186         if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
1187                 return -EINVAL;
1188
1189         if (prot & KVM_PGTABLE_PROT_R)
1190                 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
1191
1192         if (prot & KVM_PGTABLE_PROT_W)
1193                 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
1194
1195         if (prot & KVM_PGTABLE_PROT_X)
1196                 clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
1197
1198         ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
1199                                        KVM_PGTABLE_WALK_HANDLE_FAULT |
1200                                        KVM_PGTABLE_WALK_SHARED);
1201         if (!ret)
1202                 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
1203         return ret;
1204 }
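
/*
 * Illustrative sketch (hypothetical caller): upgrading permissions in response
 * to a write permission fault on an already-mapped page, instead of re-mapping
 * it. An -EAGAIN return from the shared walk means the pte changed under our
 * feet (e.g. another vCPU raced with the update) and the fault can simply be
 * retried.
 */
static int __maybe_unused example_stage2_handle_write_fault(struct kvm_pgtable *pgt,
                                                             u64 ipa)
{
        return kvm_pgtable_stage2_relax_perms(pgt, ipa, KVM_PGTABLE_PROT_W);
}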
1205
1206 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
1207                                enum kvm_pgtable_walk_flags visit)
1208 {
1209         struct kvm_pgtable *pgt = ctx->arg;
1210         struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1211
1212         if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
1213                 return 0;
1214
1215         if (mm_ops->dcache_clean_inval_poc)
1216                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1217                                                kvm_granule_size(ctx->level));
1218         return 0;
1219 }
1220
1221 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
1222 {
1223         struct kvm_pgtable_walker walker = {
1224                 .cb     = stage2_flush_walker,
1225                 .flags  = KVM_PGTABLE_WALK_LEAF,
1226                 .arg    = pgt,
1227         };
1228
1229         if (stage2_has_fwb(pgt))
1230                 return 0;
1231
1232         return kvm_pgtable_walk(pgt, addr, size, &walker);
1233 }
1234
1235
1236 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
1237                               struct kvm_pgtable_mm_ops *mm_ops,
1238                               enum kvm_pgtable_stage2_flags flags,
1239                               kvm_pgtable_force_pte_cb_t force_pte_cb)
1240 {
1241         size_t pgd_sz;
1242         u64 vtcr = mmu->arch->vtcr;
1243         u32 ia_bits = VTCR_EL2_IPA(vtcr);
1244         u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1245         u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1246
1247         pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1248         pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
1249         if (!pgt->pgd)
1250                 return -ENOMEM;
1251
1252         pgt->ia_bits            = ia_bits;
1253         pgt->start_level        = start_level;
1254         pgt->mm_ops             = mm_ops;
1255         pgt->mmu                = mmu;
1256         pgt->flags              = flags;
1257         pgt->force_pte_cb       = force_pte_cb;
1258
1259         /* Ensure zeroed PGD pages are visible to the hardware walker */
1260         dsb(ishst);
1261         return 0;
1262 }
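
/*
 * Illustrative sketch (hypothetical caller): the usual lifetime of a guest
 * stage-2 table built on the helpers in this file. 'mmu' and 'mm_ops' are
 * assumed to have been set up by the caller; no stage-2 flags or force_pte
 * callback are used here.
 */
static int __maybe_unused example_stage2_lifetime(struct kvm_pgtable *pgt,
                                                  struct kvm_s2_mmu *mmu,
                                                  struct kvm_pgtable_mm_ops *mm_ops)
{
        int ret;

        ret = __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL);
        if (ret)
                return ret;

        /* ... kvm_pgtable_stage2_map()/unmap() calls while the VM runs ... */

        kvm_pgtable_stage2_destroy(pgt);
        return 0;
}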
1263
1264 size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
1265 {
1266         u32 ia_bits = VTCR_EL2_IPA(vtcr);
1267         u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1268         u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1269
1270         return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1271 }
1272
1273 static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
1274                               enum kvm_pgtable_walk_flags visit)
1275 {
1276         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1277
1278         if (!stage2_pte_is_counted(ctx->old))
1279                 return 0;
1280
1281         mm_ops->put_page(ctx->ptep);
1282
1283         if (kvm_pte_table(ctx->old, ctx->level))
1284                 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
1285
1286         return 0;
1287 }
1288
1289 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
1290 {
1291         size_t pgd_sz;
1292         struct kvm_pgtable_walker walker = {
1293                 .cb     = stage2_free_walker,
1294                 .flags  = KVM_PGTABLE_WALK_LEAF |
1295                           KVM_PGTABLE_WALK_TABLE_POST,
1296         };
1297
1298         WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
1299         pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
1300         pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
1301         pgt->pgd = NULL;
1302 }
1303
1304 void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
1305 {
1306         kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
1307         struct kvm_pgtable_walker walker = {
1308                 .cb     = stage2_free_walker,
1309                 .flags  = KVM_PGTABLE_WALK_LEAF |
1310                           KVM_PGTABLE_WALK_TABLE_POST,
1311         };
1312         struct kvm_pgtable_walk_data data = {
1313                 .walker = &walker,
1314
1315                 /*
1316                  * At this point the IPA really doesn't matter, as the page
1317                  * table being traversed has already been removed from the stage
1318                  * 2. Set an appropriate range to cover the entire page table.
1319                  */
1320                 .addr   = 0,
1321                 .end    = kvm_granule_size(level),
1322         };
1323
1324         WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));
1325 }