drivers/iommu/iommufd/io_pagetable.c (platform/kernel/linux-starfive.git, commit a4da1817e19dd13c3b159763b9dabc187f8ed957)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of the data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18
19 #include "io_pagetable.h"
20 #include "double_span.h"
21
22 struct iopt_pages_list {
23         struct iopt_pages *pages;
24         struct iopt_area *area;
25         struct list_head next;
26         unsigned long start_byte;
27         unsigned long length;
28 };
29
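/*
 * Start iterating over the areas that cover [iova, last_iova]. Returns the
 * area containing iova, or NULL if iova does not fall inside a fully
 * initialized area. Normally used through iopt_for_each_contig_area().
 */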
30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31                                         struct io_pagetable *iopt,
32                                         unsigned long iova,
33                                         unsigned long last_iova)
34 {
35         lockdep_assert_held(&iopt->iova_rwsem);
36
37         iter->cur_iova = iova;
38         iter->last_iova = last_iova;
39         iter->area = iopt_area_iter_first(iopt, iova, iova);
40         if (!iter->area)
41                 return NULL;
42         if (!iter->area->pages) {
43                 iter->area = NULL;
44                 return NULL;
45         }
46         return iter->area;
47 }
48
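/*
 * Advance the iterator to the next area. Returns NULL once the requested
 * range has been covered, or when the next area is not contiguous with the
 * current one or is not fully initialized.
 */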
49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50 {
51         unsigned long last_iova;
52
53         if (!iter->area)
54                 return NULL;
55         last_iova = iopt_area_last_iova(iter->area);
56         if (iter->last_iova <= last_iova)
57                 return NULL;
58
59         iter->cur_iova = last_iova + 1;
60         iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61                                          iter->last_iova);
62         if (!iter->area)
63                 return NULL;
64         if (iter->cur_iova != iopt_area_iova(iter->area) ||
65             !iter->area->pages) {
66                 iter->area = NULL;
67                 return NULL;
68         }
69         return iter->area;
70 }
71
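/*
 * Check whether the current hole (IOVA that is neither mapped nor reserved)
 * can hold @length bytes starting at an address aligned to @iova_alignment
 * with @page_offset in its low bits. On success the span start is advanced
 * to the candidate IOVA. __alloc_iova_check_used() does the same check on a
 * span of the allowed ranges.
 */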
72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73                                     unsigned long length,
74                                     unsigned long iova_alignment,
75                                     unsigned long page_offset)
76 {
77         if (span->is_used || span->last_hole - span->start_hole < length - 1)
78                 return false;
79
80         span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81                            page_offset;
82         if (span->start_hole > span->last_hole ||
83             span->last_hole - span->start_hole < length - 1)
84                 return false;
85         return true;
86 }
87
88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89                                     unsigned long length,
90                                     unsigned long iova_alignment,
91                                     unsigned long page_offset)
92 {
93         if (span->is_hole || span->last_used - span->start_used < length - 1)
94                 return false;
95
96         span->start_used = ALIGN(span->start_used, iova_alignment) |
97                            page_offset;
98         if (span->start_used > span->last_used ||
99             span->last_used - span->start_used < length - 1)
100                 return false;
101         return true;
102 }
103
104 /*
105  * Automatically find a block of IOVA that is not being used and not reserved.
106  * Does not return a 0 IOVA even if it is valid.
107  */
108 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
109                            unsigned long uptr, unsigned long length)
110 {
111         unsigned long page_offset = uptr % PAGE_SIZE;
112         struct interval_tree_double_span_iter used_span;
113         struct interval_tree_span_iter allowed_span;
114         unsigned long iova_alignment;
115
116         lockdep_assert_held(&iopt->iova_rwsem);
117
118         /* Protect roundup_pow_of_two() from overflow */
119         if (length == 0 || length >= ULONG_MAX / 2)
120                 return -EOVERFLOW;
121
122         /*
123          * Keep the alignment present in the uptr when building the IOVA; this
124          * increases the chance we can map a THP.
125          */
126         if (!uptr)
127                 iova_alignment = roundup_pow_of_two(length);
128         else
129                 iova_alignment = min_t(unsigned long,
130                                        roundup_pow_of_two(length),
131                                        1UL << __ffs64(uptr));
132
133         if (iova_alignment < iopt->iova_alignment)
134                 return -EINVAL;
135
136         interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
137                                     PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
138                 if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
139                         allowed_span.start_used = PAGE_SIZE;
140                         allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
141                         allowed_span.is_hole = false;
142                 }
143
144                 if (!__alloc_iova_check_used(&allowed_span, length,
145                                              iova_alignment, page_offset))
146                         continue;
147
148                 interval_tree_for_each_double_span(
149                         &used_span, &iopt->reserved_itree, &iopt->area_itree,
150                         allowed_span.start_used, allowed_span.last_used) {
151                         if (!__alloc_iova_check_hole(&used_span, length,
152                                                      iova_alignment,
153                                                      page_offset))
154                                 continue;
155
156                         *iova = used_span.start_hole;
157                         return 0;
158                 }
159         }
160         return -ENOSPC;
161 }
162
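/*
 * Validate a caller chosen IOVA range: it must be aligned to the iopt's
 * alignment, must not wrap around, must not intersect a reserved range, and
 * must not overlap an already existing area.
 */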
163 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
164                            unsigned long length)
165 {
166         unsigned long last;
167
168         lockdep_assert_held(&iopt->iova_rwsem);
169
170         if ((iova & (iopt->iova_alignment - 1)))
171                 return -EINVAL;
172
173         if (check_add_overflow(iova, length - 1, &last))
174                 return -EOVERFLOW;
175
176         /* No reserved IOVA intersects the range */
177         if (iopt_reserved_iter_first(iopt, iova, last))
178                 return -EINVAL;
179
180         /* Check that there is not already a mapping in the range */
181         if (iopt_area_iter_first(iopt, iova, last))
182                 return -EEXIST;
183         return 0;
184 }
185
186 /*
187  * The area takes a slice of the pages from start_byte to start_byte + length
188  */
189 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
190                             struct iopt_pages *pages, unsigned long iova,
191                             unsigned long start_byte, unsigned long length,
192                             int iommu_prot)
193 {
194         lockdep_assert_held_write(&iopt->iova_rwsem);
195
196         if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
197                 return -EPERM;
198
199         area->iommu_prot = iommu_prot;
200         area->page_offset = start_byte % PAGE_SIZE;
201         if (area->page_offset & (iopt->iova_alignment - 1))
202                 return -EINVAL;
203
204         area->node.start = iova;
205         if (check_add_overflow(iova, length - 1, &area->node.last))
206                 return -EOVERFLOW;
207
208         area->pages_node.start = start_byte / PAGE_SIZE;
209         if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
210                 return -EOVERFLOW;
211         area->pages_node.last = area->pages_node.last / PAGE_SIZE;
212         if (WARN_ON(area->pages_node.last >= pages->npages))
213                 return -EOVERFLOW;
214
215         /*
216          * The area is inserted with a NULL pages pointer, indicating it is not fully
217          * initialized yet.
218          */
219         area->iopt = iopt;
220         interval_tree_insert(&area->node, &iopt->area_itree);
221         return 0;
222 }
223
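/* Allocate a zeroed area with its interval tree nodes marked as unlinked */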
224 static struct iopt_area *iopt_area_alloc(void)
225 {
226         struct iopt_area *area;
227
228         area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
229         if (!area)
230                 return NULL;
231         RB_CLEAR_NODE(&area->node.rb);
232         RB_CLEAR_NODE(&area->pages_node.rb);
233         return area;
234 }
235
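/*
 * Allocate an area for each entry on the pages_list and insert it into the
 * IOVA space, either at the caller supplied *dst_iova or, when
 * IOPT_ALLOC_IOVA is set, at an automatically chosen IOVA returned in
 * *dst_iova. The areas are inserted with a NULL pages pointer and are
 * completed later by the caller.
 */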
236 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
237                                  struct list_head *pages_list,
238                                  unsigned long length, unsigned long *dst_iova,
239                                  int iommu_prot, unsigned int flags)
240 {
241         struct iopt_pages_list *elm;
242         unsigned long iova;
243         int rc = 0;
244
245         list_for_each_entry(elm, pages_list, next) {
246                 elm->area = iopt_area_alloc();
247                 if (!elm->area)
248                         return -ENOMEM;
249         }
250
251         down_write(&iopt->iova_rwsem);
252         if ((length & (iopt->iova_alignment - 1)) || !length) {
253                 rc = -EINVAL;
254                 goto out_unlock;
255         }
256
257         if (flags & IOPT_ALLOC_IOVA) {
258                 /* Use the first entry to guess the ideal IOVA alignment */
259                 elm = list_first_entry(pages_list, struct iopt_pages_list,
260                                        next);
261                 rc = iopt_alloc_iova(
262                         iopt, dst_iova,
263                         (uintptr_t)elm->pages->uptr + elm->start_byte, length);
264                 if (rc)
265                         goto out_unlock;
266                 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
267                     WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
268                         rc = -EINVAL;
269                         goto out_unlock;
270                 }
271         } else {
272                 rc = iopt_check_iova(iopt, *dst_iova, length);
273                 if (rc)
274                         goto out_unlock;
275         }
276
277         /*
278          * Areas are created with a NULL pages pointer so that the IOVA space is
279          * reserved and we can unlock the iova_rwsem.
280          */
281         iova = *dst_iova;
282         list_for_each_entry(elm, pages_list, next) {
283                 rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
284                                       elm->start_byte, elm->length, iommu_prot);
285                 if (rc)
286                         goto out_unlock;
287                 iova += elm->length;
288         }
289
290 out_unlock:
291         up_write(&iopt->iova_rwsem);
292         return rc;
293 }
294
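/*
 * Undo iopt_area_alloc()/iopt_insert_area() for an area whose pages were
 * never set.
 */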
295 static void iopt_abort_area(struct iopt_area *area)
296 {
297         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
298                 WARN_ON(area->pages);
299         if (area->iopt) {
300                 down_write(&area->iopt->iova_rwsem);
301                 interval_tree_remove(&area->node, &area->iopt->area_itree);
302                 up_write(&area->iopt->iova_rwsem);
303         }
304         kfree(area);
305 }
306
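/*
 * Free a pages_list built for mapping, aborting any areas that were not
 * committed and dropping the references on their iopt_pages.
 */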
307 void iopt_free_pages_list(struct list_head *pages_list)
308 {
309         struct iopt_pages_list *elm;
310
311         while ((elm = list_first_entry_or_null(pages_list,
312                                                struct iopt_pages_list, next))) {
313                 if (elm->area)
314                         iopt_abort_area(elm->area);
315                 if (elm->pages)
316                         iopt_put_pages(elm->pages);
317                 list_del(&elm->next);
318                 kfree(elm);
319         }
320 }
321
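/*
 * Load the PFNs of every element on the list into all attached domains. On
 * failure the elements that were already filled are unfilled again.
 */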
322 static int iopt_fill_domains_pages(struct list_head *pages_list)
323 {
324         struct iopt_pages_list *undo_elm;
325         struct iopt_pages_list *elm;
326         int rc;
327
328         list_for_each_entry(elm, pages_list, next) {
329                 rc = iopt_area_fill_domains(elm->area, elm->pages);
330                 if (rc)
331                         goto err_undo;
332         }
333         return 0;
334
335 err_undo:
336         list_for_each_entry(undo_elm, pages_list, next) {
337                 if (undo_elm == elm)
338                         break;
339                 iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
340         }
341         return rc;
342 }
343
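/**
 * iopt_map_pages() - Map a list of pages into the io page table
 * @iopt: io_pagetable to act on
 * @pages_list: List of iopt_pages_list elements describing what to map
 * @length: Total number of bytes covered by the list
 * @dst_iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *            the chosen iova on output. Otherwise is the iova to map to on
 *            input
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * Creates one area per list element, loads the PFNs into every attached
 * domain and then commits the areas by moving the pages references in from
 * the list.
 */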
344 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
345                    unsigned long length, unsigned long *dst_iova,
346                    int iommu_prot, unsigned int flags)
347 {
348         struct iopt_pages_list *elm;
349         int rc;
350
351         rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
352                                    iommu_prot, flags);
353         if (rc)
354                 return rc;
355
356         down_read(&iopt->domains_rwsem);
357         rc = iopt_fill_domains_pages(pages_list);
358         if (rc)
359                 goto out_unlock_domains;
360
361         down_write(&iopt->iova_rwsem);
362         list_for_each_entry(elm, pages_list, next) {
363                 /*
364                  * area->pages must be set inside the domains_rwsem to ensure
365                  * any newly added domains will get filled. Moves the reference
366                  * in from the list.
367                  */
368                 elm->area->pages = elm->pages;
369                 elm->pages = NULL;
370                 elm->area = NULL;
371         }
372         up_write(&iopt->iova_rwsem);
373 out_unlock_domains:
374         up_read(&iopt->domains_rwsem);
375         return rc;
376 }
377
378 /**
379  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
380  * @ictx: iommufd_ctx the iopt is part of
381  * @iopt: io_pagetable to act on
382  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
383  *        the chosen iova on output. Otherwise is the iova to map to on input
384  * @uptr: User VA to map
385  * @length: Number of bytes to map
386  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
387  * @flags: IOPT_ALLOC_IOVA or zero
388  *
389  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
390  * page tables this will pin the pages and load them into the domain at iova.
391  * For non-domain page tables this will only setup a lazy reference and the
392  * For non-domain page tables this will only set up a lazy reference and the
393  *
394  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
395  * destroyed.
396  */
397 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
398                         unsigned long *iova, void __user *uptr,
399                         unsigned long length, int iommu_prot,
400                         unsigned int flags)
401 {
402         struct iopt_pages_list elm = {};
403         LIST_HEAD(pages_list);
404         int rc;
405
406         elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
407         if (IS_ERR(elm.pages))
408                 return PTR_ERR(elm.pages);
409         if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
410             elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
411                 elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
412         elm.start_byte = uptr - elm.pages->uptr;
413         elm.length = length;
414         list_add(&elm.next, &pages_list);
415
416         rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
417         if (rc) {
418                 if (elm.area)
419                         iopt_abort_area(elm.area);
420                 if (elm.pages)
421                         iopt_put_pages(elm.pages);
422                 return rc;
423         }
424         return 0;
425 }
426
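/**
 * iopt_get_pages() - Build a pages_list covering an IOVA range
 * @iopt: io_pagetable to act on
 * @iova: Starting iova of the range
 * @length: Number of bytes in the range
 * @pages_list: Filled with one iopt_pages_list element per area touched
 *
 * Each element takes a reference on the underlying iopt_pages and records
 * the starting byte and length of its slice. Fails with -ENOENT if the
 * range is not fully covered by contiguous areas.
 */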
427 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
428                    unsigned long length, struct list_head *pages_list)
429 {
430         struct iopt_area_contig_iter iter;
431         unsigned long last_iova;
432         struct iopt_area *area;
433         int rc;
434
435         if (!length)
436                 return -EINVAL;
437         if (check_add_overflow(iova, length - 1, &last_iova))
438                 return -EOVERFLOW;
439
440         down_read(&iopt->iova_rwsem);
441         iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
442                 struct iopt_pages_list *elm;
443                 unsigned long last = min(last_iova, iopt_area_last_iova(area));
444
445                 elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
446                 if (!elm) {
447                         rc = -ENOMEM;
448                         goto err_free;
449                 }
450                 elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
451                 elm->pages = area->pages;
452                 elm->length = (last - iter.cur_iova) + 1;
453                 kref_get(&elm->pages->kref);
454                 list_add_tail(&elm->next, pages_list);
455         }
456         if (!iopt_area_contig_done(&iter)) {
457                 rc = -ENOENT;
458                 goto err_free;
459         }
460         up_read(&iopt->iova_rwsem);
461         return 0;
462 err_free:
463         up_read(&iopt->iova_rwsem);
464         iopt_free_pages_list(pages_list);
465         return rc;
466 }
467
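/*
 * Unmap every area inside [start, last]. An area that extends beyond the
 * range causes the whole unmap to fail with -ENOENT. Areas pinned by an
 * in-kernel access are asked to drop their pins via
 * iommufd_access_notify_unmap() and the unmap is retried. Returns -ENOENT
 * if nothing was unmapped.
 */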
468 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
469                                  unsigned long last, unsigned long *unmapped)
470 {
471         struct iopt_area *area;
472         unsigned long unmapped_bytes = 0;
473         unsigned int tries = 0;
474         int rc = -ENOENT;
475
476         /*
477          * The domains_rwsem must be held in read mode any time any area->pages
478          * is NULL. This prevents domain attach/detach from running
479          * concurrently with cleaning up the area.
480          */
481 again:
482         down_read(&iopt->domains_rwsem);
483         down_write(&iopt->iova_rwsem);
484         while ((area = iopt_area_iter_first(iopt, start, last))) {
485                 unsigned long area_last = iopt_area_last_iova(area);
486                 unsigned long area_first = iopt_area_iova(area);
487                 struct iopt_pages *pages;
488
489                 /* Userspace should not race map/unmaps of the same area */
490                 if (!area->pages) {
491                         rc = -EBUSY;
492                         goto out_unlock_iova;
493                 }
494
495                 if (area_first < start || area_last > last) {
496                         rc = -ENOENT;
497                         goto out_unlock_iova;
498                 }
499
500                 if (area_first != start)
501                         tries = 0;
502
503                 /*
504                  * num_accesses writers must hold the iova_rwsem too, so we can
505          * safely read it under the write side of the iova_rwsem
506                  * without the pages->mutex.
507                  */
508                 if (area->num_accesses) {
509                         size_t length = iopt_area_length(area);
510
511                         start = area_first;
512                         area->prevent_access = true;
513                         up_write(&iopt->iova_rwsem);
514                         up_read(&iopt->domains_rwsem);
515
516                         iommufd_access_notify_unmap(iopt, area_first, length);
517                         /* Something is not responding to unmap requests. */
518                         tries++;
519                         if (WARN_ON(tries > 100))
520                                 return -EDEADLOCK;
521                         goto again;
522                 }
523
524                 pages = area->pages;
525                 area->pages = NULL;
526                 up_write(&iopt->iova_rwsem);
527
528                 iopt_area_unfill_domains(area, pages);
529                 iopt_abort_area(area);
530                 iopt_put_pages(pages);
531
532                 unmapped_bytes += area_last - area_first + 1;
533
534                 down_write(&iopt->iova_rwsem);
535         }
536         if (unmapped_bytes)
537                 rc = 0;
538
539 out_unlock_iova:
540         up_write(&iopt->iova_rwsem);
541         up_read(&iopt->domains_rwsem);
542         if (unmapped)
543                 *unmapped = unmapped_bytes;
544         return rc;
545 }
546
547 /**
548  * iopt_unmap_iova() - Remove a range of iova
549  * @iopt: io_pagetable to act on
550  * @iova: Starting iova to unmap
551  * @length: Number of bytes to unmap
552  * @unmapped: Return number of bytes unmapped
553  *
554  * The requested range must be a superset of existing ranges.
555  * Splitting/truncating IOVA mappings is not allowed.
556  */
557 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
558                     unsigned long length, unsigned long *unmapped)
559 {
560         unsigned long iova_last;
561
562         if (!length)
563                 return -EINVAL;
564
565         if (check_add_overflow(iova, length - 1, &iova_last))
566                 return -EOVERFLOW;
567
568         return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
569 }
570
571 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
572 {
573         int rc;
574
575         rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
576         /* If the IOVAs are empty then unmap all succeeds */
577         if (rc == -ENOENT)
578                 return 0;
579         return rc;
580 }
581
582 /* The caller must always free all the nodes in the allowed_iova rb_root. */
583 int iopt_set_allow_iova(struct io_pagetable *iopt,
584                         struct rb_root_cached *allowed_iova)
585 {
586         struct iopt_allowed *allowed;
587
588         down_write(&iopt->iova_rwsem);
589         swap(*allowed_iova, iopt->allowed_itree);
590
591         for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
592              allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
593                 if (iopt_reserved_iter_first(iopt, allowed->node.start,
594                                              allowed->node.last)) {
595                         swap(*allowed_iova, iopt->allowed_itree);
596                         up_write(&iopt->iova_rwsem);
597                         return -EADDRINUSE;
598                 }
599         }
600         up_write(&iopt->iova_rwsem);
601         return 0;
602 }
603
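/*
 * Reserve [start, last] so it can no longer be allocated or mapped. Fails
 * with -EADDRINUSE if the range overlaps an existing area or an allowed
 * range. @owner tags the reservation so it can later be removed with
 * iopt_remove_reserved_iova().
 */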
604 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
605                       unsigned long last, void *owner)
606 {
607         struct iopt_reserved *reserved;
608
609         lockdep_assert_held_write(&iopt->iova_rwsem);
610
611         if (iopt_area_iter_first(iopt, start, last) ||
612             iopt_allowed_iter_first(iopt, start, last))
613                 return -EADDRINUSE;
614
615         reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
616         if (!reserved)
617                 return -ENOMEM;
618         reserved->node.start = start;
619         reserved->node.last = last;
620         reserved->owner = owner;
621         interval_tree_insert(&reserved->node, &iopt->reserved_itree);
622         return 0;
623 }
624
625 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
626 {
627         struct iopt_reserved *reserved, *next;
628
629         lockdep_assert_held_write(&iopt->iova_rwsem);
630
631         for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
632              reserved = next) {
633                 next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
634
635                 if (reserved->owner == owner) {
636                         interval_tree_remove(&reserved->node,
637                                              &iopt->reserved_itree);
638                         kfree(reserved);
639                 }
640         }
641 }
642
643 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
644 {
645         down_write(&iopt->iova_rwsem);
646         __iopt_remove_reserved_iova(iopt, owner);
647         up_write(&iopt->iova_rwsem);
648 }
649
650 void iopt_init_table(struct io_pagetable *iopt)
651 {
652         init_rwsem(&iopt->iova_rwsem);
653         init_rwsem(&iopt->domains_rwsem);
654         iopt->area_itree = RB_ROOT_CACHED;
655         iopt->allowed_itree = RB_ROOT_CACHED;
656         iopt->reserved_itree = RB_ROOT_CACHED;
657         xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
658         xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
659
660         /*
661          * iopts start as SW tables that can use the entire size_t IOVA space
662          * due to the use of size_t in the APIs. They have no alignment
663          * restriction.
664          */
665         iopt->iova_alignment = 1;
666 }
667
668 void iopt_destroy_table(struct io_pagetable *iopt)
669 {
670         struct interval_tree_node *node;
671
672         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
673                 iopt_remove_reserved_iova(iopt, NULL);
674
675         while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
676                                                 ULONG_MAX))) {
677                 interval_tree_remove(node, &iopt->allowed_itree);
678                 kfree(container_of(node, struct iopt_allowed, node));
679         }
680
681         WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
682         WARN_ON(!xa_empty(&iopt->domains));
683         WARN_ON(!xa_empty(&iopt->access_list));
684         WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
685 }
686
687 /**
688  * iopt_unfill_domain() - Unfill a domain with PFNs
689  * @iopt: io_pagetable to act on
690  * @domain: domain to unfill
691  *
692  * This is used when removing a domain from the iopt. Every area in the iopt
693  * will be unmapped from the domain. The domain must already be removed from the
694  * domains xarray.
695  */
696 static void iopt_unfill_domain(struct io_pagetable *iopt,
697                                struct iommu_domain *domain)
698 {
699         struct iopt_area *area;
700
701         lockdep_assert_held(&iopt->iova_rwsem);
702         lockdep_assert_held_write(&iopt->domains_rwsem);
703
704         /*
705          * Some other domain is still holding all the pfns, so rapidly unmap this
706          * domain.
707          */
708         if (iopt->next_domain_id != 0) {
709                 /* Pick an arbitrary remaining domain to act as storage */
710                 struct iommu_domain *storage_domain =
711                         xa_load(&iopt->domains, 0);
712
713                 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
714                      area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
715                         struct iopt_pages *pages = area->pages;
716
717                         if (!pages)
718                                 continue;
719
720                         mutex_lock(&pages->mutex);
721                         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
722                                 WARN_ON(!area->storage_domain);
723                         if (area->storage_domain == domain)
724                                 area->storage_domain = storage_domain;
725                         mutex_unlock(&pages->mutex);
726
727                         iopt_area_unmap_domain(area, domain);
728                 }
729                 return;
730         }
731
732         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
733              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
734                 struct iopt_pages *pages = area->pages;
735
736                 if (!pages)
737                         continue;
738
739                 mutex_lock(&pages->mutex);
740                 interval_tree_remove(&area->pages_node, &pages->domains_itree);
741                 WARN_ON(area->storage_domain != domain);
742                 area->storage_domain = NULL;
743                 iopt_area_unfill_domain(area, pages, domain);
744                 mutex_unlock(&pages->mutex);
745         }
746 }
747
748 /**
749  * iopt_fill_domain() - Fill a domain with PFNs
750  * @iopt: io_pagetable to act on
751  * @domain: domain to fill
752  *
753  * Fill the domain with PFNs from every area in the iopt. On failure the domain
754  * is left unchanged.
755  */
756 static int iopt_fill_domain(struct io_pagetable *iopt,
757                             struct iommu_domain *domain)
758 {
759         struct iopt_area *end_area;
760         struct iopt_area *area;
761         int rc;
762
763         lockdep_assert_held(&iopt->iova_rwsem);
764         lockdep_assert_held_write(&iopt->domains_rwsem);
765
766         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
767              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
768                 struct iopt_pages *pages = area->pages;
769
770                 if (!pages)
771                         continue;
772
773                 mutex_lock(&pages->mutex);
774                 rc = iopt_area_fill_domain(area, domain);
775                 if (rc) {
776                         mutex_unlock(&pages->mutex);
777                         goto out_unfill;
778                 }
779                 if (!area->storage_domain) {
780                         WARN_ON(iopt->next_domain_id != 0);
781                         area->storage_domain = domain;
782                         interval_tree_insert(&area->pages_node,
783                                              &pages->domains_itree);
784                 }
785                 mutex_unlock(&pages->mutex);
786         }
787         return 0;
788
789 out_unfill:
790         end_area = area;
791         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
792              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
793                 struct iopt_pages *pages = area->pages;
794
795                 if (area == end_area)
796                         break;
797                 if (!pages)
798                         continue;
799                 mutex_lock(&pages->mutex);
800                 if (iopt->next_domain_id == 0) {
801                         interval_tree_remove(&area->pages_node,
802                                              &pages->domains_itree);
803                         area->storage_domain = NULL;
804                 }
805                 iopt_area_unfill_domain(area, pages, domain);
806                 mutex_unlock(&pages->mutex);
807         }
808         return rc;
809 }
810
811 /* Check that all existing areas conform to an increased page size */
812 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
813                                      unsigned long new_iova_alignment)
814 {
815         unsigned long align_mask = new_iova_alignment - 1;
816         struct iopt_area *area;
817
818         lockdep_assert_held(&iopt->iova_rwsem);
819         lockdep_assert_held(&iopt->domains_rwsem);
820
821         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
822              area = iopt_area_iter_next(area, 0, ULONG_MAX))
823                 if ((iopt_area_iova(area) & align_mask) ||
824                     (iopt_area_length(area) & align_mask) ||
825                     (area->page_offset & align_mask))
826                         return -EADDRINUSE;
827
828         if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
829                 struct iommufd_access *access;
830                 unsigned long index;
831
832                 xa_for_each(&iopt->access_list, index, access)
833                         if (WARN_ON(access->iova_alignment >
834                                     new_iova_alignment))
835                                 return -EADDRINUSE;
836         }
837         return 0;
838 }
839
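/*
 * Attach an iommu_domain to the iopt: raise the iova_alignment to the
 * domain's minimum page size if needed, reserve the IOVA outside the
 * domain's aperture, fill the domain with the PFNs of all existing areas
 * and store it in the domains xarray.
 */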
840 int iopt_table_add_domain(struct io_pagetable *iopt,
841                           struct iommu_domain *domain)
842 {
843         const struct iommu_domain_geometry *geometry = &domain->geometry;
844         struct iommu_domain *iter_domain;
845         unsigned int new_iova_alignment;
846         unsigned long index;
847         int rc;
848
849         down_write(&iopt->domains_rwsem);
850         down_write(&iopt->iova_rwsem);
851
852         xa_for_each(&iopt->domains, index, iter_domain) {
853                 if (WARN_ON(iter_domain == domain)) {
854                         rc = -EEXIST;
855                         goto out_unlock;
856                 }
857         }
858
859         /*
860          * The io page size drives the iova_alignment. Internally the iopt_pages
861          * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
862          * objects into the iommu_domain.
863          *
864  * An iommu_domain must always be able to accept PAGE_SIZE to be
865          * compatible as we can't guarantee higher contiguity.
866          */
867         new_iova_alignment = max_t(unsigned long,
868                                    1UL << __ffs(domain->pgsize_bitmap),
869                                    iopt->iova_alignment);
870         if (new_iova_alignment > PAGE_SIZE) {
871                 rc = -EINVAL;
872                 goto out_unlock;
873         }
874         if (new_iova_alignment != iopt->iova_alignment) {
875                 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
876                 if (rc)
877                         goto out_unlock;
878         }
879
880         /* No area exists that is outside the allowed domain aperture */
881         if (geometry->aperture_start != 0) {
882                 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
883                                        domain);
884                 if (rc)
885                         goto out_reserved;
886         }
887         if (geometry->aperture_end != ULONG_MAX) {
888                 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
889                                        ULONG_MAX, domain);
890                 if (rc)
891                         goto out_reserved;
892         }
893
894         rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
895         if (rc)
896                 goto out_reserved;
897
898         rc = iopt_fill_domain(iopt, domain);
899         if (rc)
900                 goto out_release;
901
902         iopt->iova_alignment = new_iova_alignment;
903         xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
904         iopt->next_domain_id++;
905         up_write(&iopt->iova_rwsem);
906         up_write(&iopt->domains_rwsem);
907         return 0;
908 out_release:
909         xa_release(&iopt->domains, iopt->next_domain_id);
910 out_reserved:
911         __iopt_remove_reserved_iova(iopt, domain);
912 out_unlock:
913         up_write(&iopt->iova_rwsem);
914         up_write(&iopt->domains_rwsem);
915         return rc;
916 }
917
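/*
 * Recompute iova_alignment as the maximum required by all attached domains
 * and accesses. An increase is only accepted if every existing area still
 * conforms to the new alignment.
 */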
918 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
919 {
920         unsigned long new_iova_alignment;
921         struct iommufd_access *access;
922         struct iommu_domain *domain;
923         unsigned long index;
924
925         lockdep_assert_held_write(&iopt->iova_rwsem);
926         lockdep_assert_held(&iopt->domains_rwsem);
927
928         /* See batch_iommu_map_small() */
929         if (iopt->disable_large_pages)
930                 new_iova_alignment = PAGE_SIZE;
931         else
932                 new_iova_alignment = 1;
933
934         xa_for_each(&iopt->domains, index, domain)
935                 new_iova_alignment = max_t(unsigned long,
936                                            1UL << __ffs(domain->pgsize_bitmap),
937                                            new_iova_alignment);
938         xa_for_each(&iopt->access_list, index, access)
939                 new_iova_alignment = max_t(unsigned long,
940                                            access->iova_alignment,
941                                            new_iova_alignment);
942
943         if (new_iova_alignment > iopt->iova_alignment) {
944                 int rc;
945
946                 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
947                 if (rc)
948                         return rc;
949         }
950         iopt->iova_alignment = new_iova_alignment;
951         return 0;
952 }
953
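/*
 * Detach an iommu_domain from the iopt: the domain is unfilled, its
 * reserved aperture ranges are removed and the iova_alignment is
 * recalculated.
 */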
954 void iopt_table_remove_domain(struct io_pagetable *iopt,
955                               struct iommu_domain *domain)
956 {
957         struct iommu_domain *iter_domain = NULL;
958         unsigned long index;
959
960         down_write(&iopt->domains_rwsem);
961         down_write(&iopt->iova_rwsem);
962
963         xa_for_each(&iopt->domains, index, iter_domain)
964                 if (iter_domain == domain)
965                         break;
966         if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
967                 goto out_unlock;
968
969         /*
970          * Compress the xarray to keep it linear by swapping the entry to erase
971          * with the tail entry and shrinking the tail.
972          */
973         iopt->next_domain_id--;
974         iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
975         if (index != iopt->next_domain_id)
976                 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
977
978         iopt_unfill_domain(iopt, domain);
979         __iopt_remove_reserved_iova(iopt, domain);
980
981         WARN_ON(iopt_calculate_iova_alignment(iopt));
982 out_unlock:
983         up_write(&iopt->iova_rwsem);
984         up_write(&iopt->domains_rwsem);
985 }
986
987 /**
988  * iopt_area_split - Split an area into two parts at iova
989  * @area: The area to split
990  * @iova: Becomes the last IOVA of a new area
991  *
992  * This splits an area into two. It is part of the VFIO compatibility to allow
993  * poking a hole in the mapping. The two areas continue to point at the same
994  * iopt_pages, just with different starting bytes.
995  */
996 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
997 {
998         unsigned long alignment = area->iopt->iova_alignment;
999         unsigned long last_iova = iopt_area_last_iova(area);
1000         unsigned long start_iova = iopt_area_iova(area);
1001         unsigned long new_start = iova + 1;
1002         struct io_pagetable *iopt = area->iopt;
1003         struct iopt_pages *pages = area->pages;
1004         struct iopt_area *lhs;
1005         struct iopt_area *rhs;
1006         int rc;
1007
1008         lockdep_assert_held_write(&iopt->iova_rwsem);
1009
1010         if (iova == start_iova || iova == last_iova)
1011                 return 0;
1012
1013         if (!pages || area->prevent_access)
1014                 return -EBUSY;
1015
1016         if (new_start & (alignment - 1) ||
1017             iopt_area_start_byte(area, new_start) & (alignment - 1))
1018                 return -EINVAL;
1019
1020         lhs = iopt_area_alloc();
1021         if (!lhs)
1022                 return -ENOMEM;
1023
1024         rhs = iopt_area_alloc();
1025         if (!rhs) {
1026                 rc = -ENOMEM;
1027                 goto err_free_lhs;
1028         }
1029
1030         mutex_lock(&pages->mutex);
1031         /*
1032          * Splitting is not permitted if an access exists; we don't track enough
1033          * information to split existing accesses.
1034          */
1035         if (area->num_accesses) {
1036                 rc = -EINVAL;
1037                 goto err_unlock;
1038         }
1039
1040         /*
1041          * Splitting is not permitted if a domain could have been mapped with
1042          * huge pages.
1043          */
1044         if (area->storage_domain && !iopt->disable_large_pages) {
1045                 rc = -EINVAL;
1046                 goto err_unlock;
1047         }
1048
1049         interval_tree_remove(&area->node, &iopt->area_itree);
1050         rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1051                               iopt_area_start_byte(area, start_iova),
1052                               (new_start - 1) - start_iova + 1,
1053                               area->iommu_prot);
1054         if (WARN_ON(rc))
1055                 goto err_insert;
1056
1057         rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1058                               iopt_area_start_byte(area, new_start),
1059                               last_iova - new_start + 1, area->iommu_prot);
1060         if (WARN_ON(rc))
1061                 goto err_remove_lhs;
1062
1063         lhs->storage_domain = area->storage_domain;
1064         lhs->pages = area->pages;
1065         rhs->storage_domain = area->storage_domain;
1066         rhs->pages = area->pages;
1067         kref_get(&rhs->pages->kref);
1068         kfree(area);
1069         mutex_unlock(&pages->mutex);
1070
1071         /*
1072          * No change to domains or accesses because the pages haven't been
1073          * changed.
1074          */
1075         return 0;
1076
1077 err_remove_lhs:
1078         interval_tree_remove(&lhs->node, &iopt->area_itree);
1079 err_insert:
1080         interval_tree_insert(&area->node, &iopt->area_itree);
1081 err_unlock:
1082         mutex_unlock(&pages->mutex);
1083         kfree(rhs);
1084 err_free_lhs:
1085         kfree(lhs);
1086         return rc;
1087 }
1088
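/*
 * Split any area containing one of the given IOVAs so that the IOVA becomes
 * the last byte of an area. Part of the VFIO compatibility that allows a
 * later unmap to poke a hole at that boundary.
 */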
1089 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1090                   size_t num_iovas)
1091 {
1092         int rc = 0;
1093         int i;
1094
1095         down_write(&iopt->iova_rwsem);
1096         for (i = 0; i < num_iovas; i++) {
1097                 struct iopt_area *area;
1098
1099                 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1100                 if (!area)
1101                         continue;
1102                 rc = iopt_area_split(area, iovas[i]);
1103                 if (rc)
1104                         break;
1105         }
1106         up_write(&iopt->iova_rwsem);
1107         return rc;
1108 }
1109
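/*
 * iopt_enable_large_pages()/iopt_disable_large_pages() control whether
 * mappings may use IOMMU page sizes larger than PAGE_SIZE. Disabling forces
 * PAGE_SIZE iova_alignment (see batch_iommu_map_small()) and is refused
 * once domains already have areas mapped into them.
 */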
1110 void iopt_enable_large_pages(struct io_pagetable *iopt)
1111 {
1112         int rc;
1113
1114         down_write(&iopt->domains_rwsem);
1115         down_write(&iopt->iova_rwsem);
1116         WRITE_ONCE(iopt->disable_large_pages, false);
1117         rc = iopt_calculate_iova_alignment(iopt);
1118         WARN_ON(rc);
1119         up_write(&iopt->iova_rwsem);
1120         up_write(&iopt->domains_rwsem);
1121 }
1122
1123 int iopt_disable_large_pages(struct io_pagetable *iopt)
1124 {
1125         int rc = 0;
1126
1127         down_write(&iopt->domains_rwsem);
1128         down_write(&iopt->iova_rwsem);
1129         if (iopt->disable_large_pages)
1130                 goto out_unlock;
1131
1132         /* Won't do it if domains already have pages mapped in them */
1133         if (!xa_empty(&iopt->domains) &&
1134             !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1135                 rc = -EINVAL;
1136                 goto out_unlock;
1137         }
1138
1139         WRITE_ONCE(iopt->disable_large_pages, true);
1140         rc = iopt_calculate_iova_alignment(iopt);
1141         if (rc)
1142                 WRITE_ONCE(iopt->disable_large_pages, false);
1143 out_unlock:
1144         up_write(&iopt->iova_rwsem);
1145         up_write(&iopt->domains_rwsem);
1146         return rc;
1147 }
1148
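/*
 * Register an in-kernel access with the iopt and recompute the
 * iova_alignment it requires; undone by iopt_remove_access().
 */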
1149 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1150 {
1151         int rc;
1152
1153         down_write(&iopt->domains_rwsem);
1154         down_write(&iopt->iova_rwsem);
1155         rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
1156                       xa_limit_16b, GFP_KERNEL_ACCOUNT);
1157         if (rc)
1158                 goto out_unlock;
1159
1160         rc = iopt_calculate_iova_alignment(iopt);
1161         if (rc) {
1162                 xa_erase(&iopt->access_list, access->iopt_access_list_id);
1163                 goto out_unlock;
1164         }
1165
1166 out_unlock:
1167         up_write(&iopt->iova_rwsem);
1168         up_write(&iopt->domains_rwsem);
1169         return rc;
1170 }
1171
1172 void iopt_remove_access(struct io_pagetable *iopt,
1173                         struct iommufd_access *access,
1174                         u32 iopt_access_list_id)
1175 {
1176         down_write(&iopt->domains_rwsem);
1177         down_write(&iopt->iova_rwsem);
1178         WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1179         WARN_ON(iopt_calculate_iova_alignment(iopt));
1180         up_write(&iopt->iova_rwsem);
1181         up_write(&iopt->domains_rwsem);
1182 }
1183
1184 /* Narrow the usable IOVA space to exclude the reserved ranges of a device. */
1185 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1186                                         struct device *dev,
1187                                         phys_addr_t *sw_msi_start)
1188 {
1189         struct iommu_resv_region *resv;
1190         LIST_HEAD(resv_regions);
1191         unsigned int num_hw_msi = 0;
1192         unsigned int num_sw_msi = 0;
1193         int rc;
1194
1195         if (iommufd_should_fail())
1196                 return -EINVAL;
1197
1198         down_write(&iopt->iova_rwsem);
1199         /* FIXME: drivers allocate memory but there is no failure propagated */
1200         iommu_get_resv_regions(dev, &resv_regions);
1201
1202         list_for_each_entry(resv, &resv_regions, list) {
1203                 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1204                         continue;
1205
1206                 if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1207                         num_hw_msi++;
1208                 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1209                         *sw_msi_start = resv->start;
1210                         num_sw_msi++;
1211                 }
1212
1213                 rc = iopt_reserve_iova(iopt, resv->start,
1214                                        resv->length - 1 + resv->start, dev);
1215                 if (rc)
1216                         goto out_reserved;
1217         }
1218
1219         /* Drivers must offer sane combinations of regions */
1220         if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1221                 rc = -EINVAL;
1222                 goto out_reserved;
1223         }
1224
1225         rc = 0;
1226         goto out_free_resv;
1227
1228 out_reserved:
1229         __iopt_remove_reserved_iova(iopt, dev);
1230 out_free_resv:
1231         iommu_put_resv_regions(dev, &resv_regions);
1232         up_write(&iopt->iova_rwsem);
1233         return rc;
1234 }