[platform/kernel/linux-starfive.git] drivers/vfio/vfio_iommu_spapr_tce.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO: IOMMU DMA mapping support for TCE on POWER
4  *
5  * Copyright (C) 2013 IBM Corp.  All rights reserved.
6  *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
7  * Copyright Gavin Shan, IBM Corporation 2014.
8  *
9  * Derived from original vfio_iommu_type1.c:
10  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
11  *     Author: Alex Williamson <alex.williamson@redhat.com>
12  */
13
14 #include <linux/module.h>
15 #include <linux/pci.h>
16 #include <linux/slab.h>
17 #include <linux/uaccess.h>
18 #include <linux/err.h>
19 #include <linux/vfio.h>
20 #include <linux/vmalloc.h>
21 #include <linux/sched/mm.h>
22 #include <linux/sched/signal.h>
23 #include <linux/mm.h>
24 #include "vfio.h"
25
26 #include <asm/iommu.h>
27 #include <asm/tce.h>
28 #include <asm/mmu_context.h>
29
30 #define DRIVER_VERSION  "0.1"
31 #define DRIVER_AUTHOR   "aik@ozlabs.ru"
32 #define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
33
34 static void tce_iommu_detach_group(void *iommu_data,
35                 struct iommu_group *iommu_group);
36
37 /*
38  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
39  *
40  * This code handles mapping and unmapping of user data buffers
41  * into DMA'ble space using the IOMMU.
42  */
43
44 struct tce_iommu_group {
45         struct list_head next;
46         struct iommu_group *grp;
47 };
48
49 /*
50  * A container needs to remember which preregistered regions it has
51  * referenced so it can do proper cleanup when the userspace process exits.
52  */
53 struct tce_iommu_prereg {
54         struct list_head next;
55         struct mm_iommu_table_group_mem_t *mem;
56 };
57
58 /*
59  * The API does not supply the container with an IOMMU group at the moment
60  * of initialization, so the container keeps a list of the groups attached
61  * to it later (one group for v1, possibly several for v2).
62  */
63 struct tce_container {
64         struct mutex lock;
65         bool enabled;
66         bool v2;
67         bool def_window_pending;
68         unsigned long locked_pages;
69         struct mm_struct *mm;
70         struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
71         struct list_head group_list;
72         struct list_head prereg_list;
73 };
74
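/*
 * Bind the container to the calling process's mm on first use; later calls
 * from a different mm fail with -EPERM. The mmgrab() reference is dropped
 * in tce_iommu_release().
 */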
75 static long tce_iommu_mm_set(struct tce_container *container)
76 {
77         if (container->mm) {
78                 if (container->mm == current->mm)
79                         return 0;
80                 return -EPERM;
81         }
82         BUG_ON(!current->mm);
83         container->mm = current->mm;
84         mmgrab(container->mm);
85
86         return 0;
87 }
88
89 static long tce_iommu_prereg_free(struct tce_container *container,
90                 struct tce_iommu_prereg *tcemem)
91 {
92         long ret;
93
94         ret = mm_iommu_put(container->mm, tcemem->mem);
95         if (ret)
96                 return ret;
97
98         list_del(&tcemem->next);
99         kfree(tcemem);
100
101         return 0;
102 }
103
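/*
 * Handler for VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: find the preregistered
 * region matching vaddr/size on the container's list and drop the
 * reference taken at registration time.
 */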
104 static long tce_iommu_unregister_pages(struct tce_container *container,
105                 __u64 vaddr, __u64 size)
106 {
107         struct mm_iommu_table_group_mem_t *mem;
108         struct tce_iommu_prereg *tcemem;
109         bool found = false;
110         long ret;
111
112         if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
113                 return -EINVAL;
114
115         mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
116         if (!mem)
117                 return -ENOENT;
118
119         list_for_each_entry(tcemem, &container->prereg_list, next) {
120                 if (tcemem->mem == mem) {
121                         found = true;
122                         break;
123                 }
124         }
125
126         if (!found)
127                 ret = -ENOENT;
128         else
129                 ret = tce_iommu_prereg_free(container, tcemem);
130
131         mm_iommu_put(container->mm, mem);
132
133         return ret;
134 }
135
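/*
 * Handler for VFIO_IOMMU_SPAPR_REGISTER_MEMORY: preregister a userspace
 * memory region so the v2 map path can translate user addresses through
 * the cache instead of pinning pages on every mapping; this also marks the
 * container enabled. Registering the same region twice fails with -EBUSY.
 */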
136 static long tce_iommu_register_pages(struct tce_container *container,
137                 __u64 vaddr, __u64 size)
138 {
139         long ret = 0;
140         struct mm_iommu_table_group_mem_t *mem = NULL;
141         struct tce_iommu_prereg *tcemem;
142         unsigned long entries = size >> PAGE_SHIFT;
143
144         if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
145                         ((vaddr + size) < vaddr))
146                 return -EINVAL;
147
148         mem = mm_iommu_get(container->mm, vaddr, entries);
149         if (mem) {
150                 list_for_each_entry(tcemem, &container->prereg_list, next) {
151                         if (tcemem->mem == mem) {
152                                 ret = -EBUSY;
153                                 goto put_exit;
154                         }
155                 }
156         } else {
157                 ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
158                 if (ret)
159                         return ret;
160         }
161
162         tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
163         if (!tcemem) {
164                 ret = -ENOMEM;
165                 goto put_exit;
166         }
167
168         tcemem->mem = mem;
169         list_add(&tcemem->next, &container->prereg_list);
170
171         container->enabled = true;
172
173         return 0;
174
175 put_exit:
176         mm_iommu_put(container->mm, mem);
177         return ret;
178 }
179
180 static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
181                 unsigned int it_page_shift)
182 {
183         struct page *page;
184         unsigned long size = 0;
185
186         if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
187                 return size == (1UL << it_page_shift);
188
189         page = pfn_to_page(hpa >> PAGE_SHIFT);
190         /*
191          * Check that the TCE table granularity is not bigger than the size of
192          * a page we just found. Otherwise the hardware can get access to
193          * a bigger memory chunk than it should.
194          */
195         return page_shift(compound_head(page)) >= it_page_shift;
196 }
197
198 static inline bool tce_groups_attached(struct tce_container *container)
199 {
200         return !list_empty(&container->group_list);
201 }
202
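/*
 * Find the DMA window (TCE table) whose IOVA range contains @ioba.
 * Returns the table index and sets *ptbl on success, or -1 if no window
 * covers the address.
 */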
203 static long tce_iommu_find_table(struct tce_container *container,
204                 phys_addr_t ioba, struct iommu_table **ptbl)
205 {
206         long i;
207
208         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
209                 struct iommu_table *tbl = container->tables[i];
210
211                 if (tbl) {
212                         unsigned long entry = ioba >> tbl->it_page_shift;
213                         unsigned long start = tbl->it_offset;
214                         unsigned long end = start + tbl->it_size;
215
216                         if ((start <= entry) && (entry < end)) {
217                                 *ptbl = tbl;
218                                 return i;
219                         }
220                 }
221         }
222
223         return -1;
224 }
225
226 static int tce_iommu_find_free_table(struct tce_container *container)
227 {
228         int i;
229
230         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
231                 if (!container->tables[i])
232                         return i;
233         }
234
235         return -ENOSPC;
236 }
237
238 static int tce_iommu_enable(struct tce_container *container)
239 {
240         int ret = 0;
241         unsigned long locked;
242         struct iommu_table_group *table_group;
243         struct tce_iommu_group *tcegrp;
244
245         if (container->enabled)
246                 return -EBUSY;
247
248         /*
249          * When userspace pages are mapped into the IOMMU, they are effectively
250          * locked memory, so, theoretically, we need to update the accounting
251          * of locked pages on each map and unmap.  For powerpc, the map/unmap
252          * paths can be very hot, though, and the accounting would kill
253          * performance, especially since it would be difficult or impossible
254          * to handle the accounting in real mode only.
255          *
256          * To address that, rather than precisely accounting every page, we
257          * instead account for a worst case on locked memory when the iommu is
258          * enabled and disabled.  The worst case upper bound on locked memory
259          * is the size of the whole iommu window, which is usually relatively
260          * small (compared to total memory sizes) on POWER hardware.
261          *
262          * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
263          * that would effectively kill the guest at random points, so it is much
264          * better to enforce the limit based on the max that the guest can map.
265          *
266          * Unfortunately, at the moment this counts whole tables, no matter how
267          * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
268          * each with a 2GB DMA window, 8GB will be counted here. The reason for
269          * this is that we cannot tell here how much RAM the guest actually uses,
270          * as this information is only available from KVM and VFIO is
271          * KVM agnostic.
272          *
273          * So we do not allow enabling a container without a group attached
274          * as there is no way to know how much we should increment
275          * the locked_vm counter.
276          */
277         if (!tce_groups_attached(container))
278                 return -ENODEV;
279
280         tcegrp = list_first_entry(&container->group_list,
281                         struct tce_iommu_group, next);
282         table_group = iommu_group_get_iommudata(tcegrp->grp);
283         if (!table_group)
284                 return -ENODEV;
285
286         if (!table_group->tce32_size)
287                 return -EPERM;
288
289         ret = tce_iommu_mm_set(container);
290         if (ret)
291                 return ret;
292
293         locked = table_group->tce32_size >> PAGE_SHIFT;
294         ret = account_locked_vm(container->mm, locked, true);
295         if (ret)
296                 return ret;
297
298         container->locked_pages = locked;
299
300         container->enabled = true;
301
302         return ret;
303 }
304
305 static void tce_iommu_disable(struct tce_container *container)
306 {
307         if (!container->enabled)
308                 return;
309
310         container->enabled = false;
311
312         BUG_ON(!container->mm);
313         account_locked_vm(container->mm, container->locked_pages, false);
314 }
315
316 static void *tce_iommu_open(unsigned long arg)
317 {
318         struct tce_container *container;
319
320         if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
321                 pr_err("tce_vfio: Wrong IOMMU type\n");
322                 return ERR_PTR(-EINVAL);
323         }
324
325         container = kzalloc(sizeof(*container), GFP_KERNEL);
326         if (!container)
327                 return ERR_PTR(-ENOMEM);
328
329         mutex_init(&container->lock);
330         INIT_LIST_HEAD_RCU(&container->group_list);
331         INIT_LIST_HEAD_RCU(&container->prereg_list);
332
333         container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
334
335         return container;
336 }
337
338 static int tce_iommu_clear(struct tce_container *container,
339                 struct iommu_table *tbl,
340                 unsigned long entry, unsigned long pages);
341 static void tce_iommu_free_table(struct tce_container *container,
342                 struct iommu_table *tbl);
343
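/*
 * Container release callback: detach any remaining groups, dispose of
 * tables created by VFIO, drop preregistered regions and finally release
 * the mm reference taken in tce_iommu_mm_set().
 */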
344 static void tce_iommu_release(void *iommu_data)
345 {
346         struct tce_container *container = iommu_data;
347         struct tce_iommu_group *tcegrp;
348         struct tce_iommu_prereg *tcemem, *tmtmp;
349         long i;
350
351         while (tce_groups_attached(container)) {
352                 tcegrp = list_first_entry(&container->group_list,
353                                 struct tce_iommu_group, next);
354                 tce_iommu_detach_group(iommu_data, tcegrp->grp);
355         }
356
357         /*
358          * If VFIO created a table, it was not disposed of
359          * by tce_iommu_detach_group(), so do it now.
360          */
361         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
362                 struct iommu_table *tbl = container->tables[i];
363
364                 if (!tbl)
365                         continue;
366
367                 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
368                 tce_iommu_free_table(container, tbl);
369         }
370
371         list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
372                 WARN_ON(tce_iommu_prereg_free(container, tcemem));
373
374         tce_iommu_disable(container);
375         if (container->mm)
376                 mmdrop(container->mm);
377         mutex_destroy(&container->lock);
378
379         kfree(container);
380 }
381
382 static void tce_iommu_unuse_page(unsigned long hpa)
383 {
384         struct page *page;
385
386         page = pfn_to_page(hpa >> PAGE_SHIFT);
387         unpin_user_page(page);
388 }
389
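/*
 * Translate a userspace address into a host physical address using the
 * preregistered memory cache, and return the region descriptor so the
 * caller can update its "mapped" counter.
 */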
390 static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
391                 unsigned long tce, unsigned long shift,
392                 unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
393 {
394         long ret = 0;
395         struct mm_iommu_table_group_mem_t *mem;
396
397         mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
398         if (!mem)
399                 return -EINVAL;
400
401         ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
402         if (ret)
403                 return -EINVAL;
404
405         *pmem = mem;
406
407         return 0;
408 }
409
410 static void tce_iommu_unuse_page_v2(struct tce_container *container,
411                 struct iommu_table *tbl, unsigned long entry)
412 {
413         struct mm_iommu_table_group_mem_t *mem = NULL;
414         int ret;
415         unsigned long hpa = 0;
416         __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
417
418         if (!pua)
419                 return;
420
421         ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
422                         tbl->it_page_shift, &hpa, &mem);
423         if (ret)
424                 pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
425                                 __func__, be64_to_cpu(*pua), entry, ret);
426         if (mem)
427                 mm_iommu_mapped_dec(mem);
428
429         *pua = cpu_to_be64(0);
430 }
431
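/*
 * Clear @pages TCE entries starting at @entry, unpinning (v1) or
 * unreferencing (v2) the pages they pointed to, and invalidate the
 * affected range in the hardware at the end.
 */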
432 static int tce_iommu_clear(struct tce_container *container,
433                 struct iommu_table *tbl,
434                 unsigned long entry, unsigned long pages)
435 {
436         unsigned long oldhpa;
437         long ret;
438         enum dma_data_direction direction;
439         unsigned long lastentry = entry + pages, firstentry = entry;
440
441         for ( ; entry < lastentry; ++entry) {
442                 if (tbl->it_indirect_levels && tbl->it_userspace) {
443                         /*
444                          * For multilevel tables, we can take a shortcut here
445                          * and skip some TCEs as we know that the userspace
446                          * address cache is a mirror of the real TCE table
447                          * and if it is missing some indirect levels, then
448                          * the hardware table does not have them allocated
449                          * either and therefore does not require updating.
450                          */
451                         __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
452                                         entry);
453                         if (!pua) {
454                                 /* align to level_size which is power of two */
455                                 entry |= tbl->it_level_size - 1;
456                                 continue;
457                         }
458                 }
459
460                 cond_resched();
461
462                 direction = DMA_NONE;
463                 oldhpa = 0;
464                 ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
465                                 &direction);
466                 if (ret)
467                         continue;
468
469                 if (direction == DMA_NONE)
470                         continue;
471
472                 if (container->v2) {
473                         tce_iommu_unuse_page_v2(container, tbl, entry);
474                         continue;
475                 }
476
477                 tce_iommu_unuse_page(oldhpa);
478         }
479
480         iommu_tce_kill(tbl, firstentry, pages);
481
482         return 0;
483 }
484
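/* Pin the userspace page backing @tce and return its host physical address. */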
485 static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
486 {
487         struct page *page = NULL;
488         enum dma_data_direction direction = iommu_tce_direction(tce);
489
490         if (pin_user_pages_fast(tce & PAGE_MASK, 1,
491                         direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
492                         &page) != 1)
493                 return -EFAULT;
494
495         *hpa = __pa((unsigned long) page_address(page));
496
497         return 0;
498 }
499
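/*
 * v1 map path: pin userspace pages one by one and program them into the
 * TCE table. On failure the entries set so far are cleared; on success
 * the whole range is invalidated once via iommu_tce_kill().
 */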
500 static long tce_iommu_build(struct tce_container *container,
501                 struct iommu_table *tbl,
502                 unsigned long entry, unsigned long tce, unsigned long pages,
503                 enum dma_data_direction direction)
504 {
505         long i, ret = 0;
506         unsigned long hpa;
507         enum dma_data_direction dirtmp;
508
509         for (i = 0; i < pages; ++i) {
510                 unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
511
512                 ret = tce_iommu_use_page(tce, &hpa);
513                 if (ret)
514                         break;
515
516                 if (!tce_page_is_contained(container->mm, hpa,
517                                 tbl->it_page_shift)) {
518                         ret = -EPERM;
519                         break;
520                 }
521
522                 hpa |= offset;
523                 dirtmp = direction;
524                 ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
525                                 &hpa, &dirtmp);
526                 if (ret) {
527                         tce_iommu_unuse_page(hpa);
528                         pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
529                                         __func__, entry << tbl->it_page_shift,
530                                         tce, ret);
531                         break;
532                 }
533
534                 if (dirtmp != DMA_NONE)
535                         tce_iommu_unuse_page(hpa);
536
537                 tce += IOMMU_PAGE_SIZE(tbl);
538         }
539
540         if (ret)
541                 tce_iommu_clear(container, tbl, entry, i);
542         else
543                 iommu_tce_kill(tbl, entry, pages);
544
545         return ret;
546 }
547
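/*
 * v2 map path: like tce_iommu_build() but translates user addresses via
 * the preregistered memory cache and stores the userspace address of each
 * entry so it can be released later by tce_iommu_unuse_page_v2().
 */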
548 static long tce_iommu_build_v2(struct tce_container *container,
549                 struct iommu_table *tbl,
550                 unsigned long entry, unsigned long tce, unsigned long pages,
551                 enum dma_data_direction direction)
552 {
553         long i, ret = 0;
554         unsigned long hpa;
555         enum dma_data_direction dirtmp;
556
557         for (i = 0; i < pages; ++i) {
558                 struct mm_iommu_table_group_mem_t *mem = NULL;
559                 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
560
561                 ret = tce_iommu_prereg_ua_to_hpa(container,
562                                 tce, tbl->it_page_shift, &hpa, &mem);
563                 if (ret)
564                         break;
565
566                 if (!tce_page_is_contained(container->mm, hpa,
567                                 tbl->it_page_shift)) {
568                         ret = -EPERM;
569                         break;
570                 }
571
572                 /* Preserve offset within IOMMU page */
573                 hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
574                 dirtmp = direction;
575
576                 /* The registered region is being unregistered */
577                 if (mm_iommu_mapped_inc(mem))
578                         break;
579
580                 ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
581                                 &hpa, &dirtmp);
582                 if (ret) {
583                         /* dirtmp cannot be DMA_NONE here */
584                         tce_iommu_unuse_page_v2(container, tbl, entry + i);
585                         pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
586                                         __func__, entry << tbl->it_page_shift,
587                                         tce, ret);
588                         break;
589                 }
590
591                 if (dirtmp != DMA_NONE)
592                         tce_iommu_unuse_page_v2(container, tbl, entry + i);
593
594                 *pua = cpu_to_be64(tce);
595
596                 tce += IOMMU_PAGE_SIZE(tbl);
597         }
598
599         if (ret)
600                 tce_iommu_clear(container, tbl, entry, i);
601         else
602                 iommu_tce_kill(tbl, entry, pages);
603
604         return ret;
605 }
606
607 static long tce_iommu_create_table(struct tce_container *container,
608                         struct iommu_table_group *table_group,
609                         int num,
610                         __u32 page_shift,
611                         __u64 window_size,
612                         __u32 levels,
613                         struct iommu_table **ptbl)
614 {
615         long ret, table_size;
616
617         table_size = table_group->ops->get_table_size(page_shift, window_size,
618                         levels);
619         if (!table_size)
620                 return -EINVAL;
621
622         ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
623         if (ret)
624                 return ret;
625
626         ret = table_group->ops->create_table(table_group, num,
627                         page_shift, window_size, levels, ptbl);
628
629         WARN_ON(!ret && !(*ptbl)->it_ops->free);
630         WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
631
632         return ret;
633 }
634
635 static void tce_iommu_free_table(struct tce_container *container,
636                 struct iommu_table *tbl)
637 {
638         unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
639
640         iommu_tce_table_put(tbl);
641         account_locked_vm(container->mm, pages, false);
642 }
643
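/*
 * Backend for VFIO_IOMMU_SPAPR_TCE_CREATE: allocate a TCE table in a free
 * slot and program it into every attached group; the groups were already
 * checked for compatibility at attach time.
 */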
644 static long tce_iommu_create_window(struct tce_container *container,
645                 __u32 page_shift, __u64 window_size, __u32 levels,
646                 __u64 *start_addr)
647 {
648         struct tce_iommu_group *tcegrp;
649         struct iommu_table_group *table_group;
650         struct iommu_table *tbl = NULL;
651         long ret, num;
652
653         num = tce_iommu_find_free_table(container);
654         if (num < 0)
655                 return num;
656
657         /* Get the first group for ops::create_table */
658         tcegrp = list_first_entry(&container->group_list,
659                         struct tce_iommu_group, next);
660         table_group = iommu_group_get_iommudata(tcegrp->grp);
661         if (!table_group)
662                 return -EFAULT;
663
664         if (!(table_group->pgsizes & (1ULL << page_shift)))
665                 return -EINVAL;
666
667         if (!table_group->ops->set_window || !table_group->ops->unset_window ||
668                         !table_group->ops->get_table_size ||
669                         !table_group->ops->create_table)
670                 return -EPERM;
671
672         /* Create TCE table */
673         ret = tce_iommu_create_table(container, table_group, num,
674                         page_shift, window_size, levels, &tbl);
675         if (ret)
676                 return ret;
677
678         BUG_ON(!tbl->it_ops->free);
679
680         /*
681          * Program the table into every group.
682          * Groups have been tested for compatibility at attach time.
683          */
684         list_for_each_entry(tcegrp, &container->group_list, next) {
685                 table_group = iommu_group_get_iommudata(tcegrp->grp);
686
687                 ret = table_group->ops->set_window(table_group, num, tbl);
688                 if (ret)
689                         goto unset_exit;
690         }
691
692         container->tables[num] = tbl;
693
694         /* Return start address assigned by platform in create_table() */
695         *start_addr = tbl->it_offset << tbl->it_page_shift;
696
697         return 0;
698
699 unset_exit:
700         list_for_each_entry(tcegrp, &container->group_list, next) {
701                 table_group = iommu_group_get_iommudata(tcegrp->grp);
702                 table_group->ops->unset_window(table_group, num);
703         }
704         tce_iommu_free_table(container, tbl);
705
706         return ret;
707 }
708
709 static long tce_iommu_remove_window(struct tce_container *container,
710                 __u64 start_addr)
711 {
712         struct iommu_table_group *table_group = NULL;
713         struct iommu_table *tbl;
714         struct tce_iommu_group *tcegrp;
715         int num;
716
717         num = tce_iommu_find_table(container, start_addr, &tbl);
718         if (num < 0)
719                 return -EINVAL;
720
721         BUG_ON(!tbl->it_size);
722
723         /* Detach groups from IOMMUs */
724         list_for_each_entry(tcegrp, &container->group_list, next) {
725                 table_group = iommu_group_get_iommudata(tcegrp->grp);
726
727                 /*
728                  * SPAPR TCE IOMMU exposes the default DMA window to
729                  * the guest via dma32_window_start/size of
730                  * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
731                  * userspace to remove this window and some do not, so
732                  * here we check for the platform capability.
733                  */
734                 if (!table_group->ops || !table_group->ops->unset_window)
735                         return -EPERM;
736
737                 table_group->ops->unset_window(table_group, num);
738         }
739
740         /* Free table */
741         tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
742         tce_iommu_free_table(container, tbl);
743         container->tables[num] = NULL;
744
745         return 0;
746 }
747
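/*
 * Create the default 32-bit DMA window if one is pending. Creation is
 * deferred until the first map/unmap or explicit window creation so that
 * userspace gets a chance to remove the default window first (see the
 * VFIO_IOMMU_SPAPR_TCE_REMOVE handler below).
 */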
748 static long tce_iommu_create_default_window(struct tce_container *container)
749 {
750         long ret;
751         __u64 start_addr = 0;
752         struct tce_iommu_group *tcegrp;
753         struct iommu_table_group *table_group;
754
755         if (!container->def_window_pending)
756                 return 0;
757
758         if (!tce_groups_attached(container))
759                 return -ENODEV;
760
761         tcegrp = list_first_entry(&container->group_list,
762                         struct tce_iommu_group, next);
763         table_group = iommu_group_get_iommudata(tcegrp->grp);
764         if (!table_group)
765                 return -ENODEV;
766
767         ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
768                         table_group->tce32_size, 1, &start_addr);
769         WARN_ON_ONCE(!ret && start_addr);
770
771         if (!ret)
772                 container->def_window_pending = false;
773
774         return ret;
775 }
776
777 static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group,
778                                        unsigned long arg)
779 {
780         struct eeh_pe *pe;
781         struct vfio_eeh_pe_op op;
782         unsigned long minsz;
783
784         pe = eeh_iommu_group_to_pe(group);
785         if (!pe)
786                 return -ENODEV;
787
788         minsz = offsetofend(struct vfio_eeh_pe_op, op);
789         if (copy_from_user(&op, (void __user *)arg, minsz))
790                 return -EFAULT;
791         if (op.argsz < minsz || op.flags)
792                 return -EINVAL;
793
794         switch (op.op) {
795         case VFIO_EEH_PE_DISABLE:
796                 return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
797         case VFIO_EEH_PE_ENABLE:
798                 return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
799         case VFIO_EEH_PE_UNFREEZE_IO:
800                 return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
801         case VFIO_EEH_PE_UNFREEZE_DMA:
802                 return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
803         case VFIO_EEH_PE_GET_STATE:
804                 return eeh_pe_get_state(pe);
806         case VFIO_EEH_PE_RESET_DEACTIVATE:
807                 return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
808         case VFIO_EEH_PE_RESET_HOT:
809                 return eeh_pe_reset(pe, EEH_RESET_HOT, true);
810         case VFIO_EEH_PE_RESET_FUNDAMENTAL:
811                 return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
812         case VFIO_EEH_PE_CONFIGURE:
813                 return eeh_pe_configure(pe);
814         case VFIO_EEH_PE_INJECT_ERR:
815                 minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
816                 if (op.argsz < minsz)
817                         return -EINVAL;
818                 if (copy_from_user(&op, (void __user *)arg, minsz))
819                         return -EFAULT;
820
821                 return eeh_pe_inject_err(pe, op.err.type, op.err.func,
822                                          op.err.addr, op.err.mask);
823         default:
824                 return -EINVAL;
825         }
826 }
827
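/*
 * The main ioctl handler. As a rough illustration (not a complete recipe,
 * and assuming the generic VFIO container/group setup has already been
 * done by the core), a v1 user typically issues:
 *
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *
 * while a v2 user skips ENABLE/DISABLE and uses REGISTER_MEMORY,
 * TCE_CREATE/TCE_REMOVE and UNREGISTER_MEMORY instead.
 */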
828 static long tce_iommu_ioctl(void *iommu_data,
829                                  unsigned int cmd, unsigned long arg)
830 {
831         struct tce_container *container = iommu_data;
832         unsigned long minsz, ddwsz;
833         long ret;
834
835         switch (cmd) {
836         case VFIO_CHECK_EXTENSION:
837                 switch (arg) {
838                 case VFIO_SPAPR_TCE_IOMMU:
839                 case VFIO_SPAPR_TCE_v2_IOMMU:
840                         return 1;
841                 case VFIO_EEH:
842                         return eeh_enabled();
843                 default:
844                         return 0;
845                 }
846         }
847
848         /*
849          * Sanity check to prevent one userspace process from manipulating
850          * another process's mm.
851          */
852         BUG_ON(!container);
853         if (container->mm && container->mm != current->mm)
854                 return -EPERM;
855
856         switch (cmd) {
857         case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
858                 struct vfio_iommu_spapr_tce_info info;
859                 struct tce_iommu_group *tcegrp;
860                 struct iommu_table_group *table_group;
861
862                 if (!tce_groups_attached(container))
863                         return -ENXIO;
864
865                 tcegrp = list_first_entry(&container->group_list,
866                                 struct tce_iommu_group, next);
867                 table_group = iommu_group_get_iommudata(tcegrp->grp);
868
869                 if (!table_group)
870                         return -ENXIO;
871
872                 minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
873                                 dma32_window_size);
874
875                 if (copy_from_user(&info, (void __user *)arg, minsz))
876                         return -EFAULT;
877
878                 if (info.argsz < minsz)
879                         return -EINVAL;
880
881                 info.dma32_window_start = table_group->tce32_start;
882                 info.dma32_window_size = table_group->tce32_size;
883                 info.flags = 0;
884                 memset(&info.ddw, 0, sizeof(info.ddw));
885
886                 if (table_group->max_dynamic_windows_supported &&
887                                 container->v2) {
888                         info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
889                         info.ddw.pgsizes = table_group->pgsizes;
890                         info.ddw.max_dynamic_windows_supported =
891                                 table_group->max_dynamic_windows_supported;
892                         info.ddw.levels = table_group->max_levels;
893                 }
894
895                 ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
896
897                 if (info.argsz >= ddwsz)
898                         minsz = ddwsz;
899
900                 if (copy_to_user((void __user *)arg, &info, minsz))
901                         return -EFAULT;
902
903                 return 0;
904         }
905         case VFIO_IOMMU_MAP_DMA: {
906                 struct vfio_iommu_type1_dma_map param;
907                 struct iommu_table *tbl = NULL;
908                 long num;
909                 enum dma_data_direction direction;
910
911                 if (!container->enabled)
912                         return -EPERM;
913
914                 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
915
916                 if (copy_from_user(&param, (void __user *)arg, minsz))
917                         return -EFAULT;
918
919                 if (param.argsz < minsz)
920                         return -EINVAL;
921
922                 if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
923                                 VFIO_DMA_MAP_FLAG_WRITE))
924                         return -EINVAL;
925
926                 ret = tce_iommu_create_default_window(container);
927                 if (ret)
928                         return ret;
929
930                 num = tce_iommu_find_table(container, param.iova, &tbl);
931                 if (num < 0)
932                         return -ENXIO;
933
934                 if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
935                                 (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
936                         return -EINVAL;
937
938                 /* iova is checked by the IOMMU API */
939                 if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
940                         if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
941                                 direction = DMA_BIDIRECTIONAL;
942                         else
943                                 direction = DMA_TO_DEVICE;
944                 } else {
945                         if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
946                                 direction = DMA_FROM_DEVICE;
947                         else
948                                 return -EINVAL;
949                 }
950
951                 ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
952                 if (ret)
953                         return ret;
954
955                 if (container->v2)
956                         ret = tce_iommu_build_v2(container, tbl,
957                                         param.iova >> tbl->it_page_shift,
958                                         param.vaddr,
959                                         param.size >> tbl->it_page_shift,
960                                         direction);
961                 else
962                         ret = tce_iommu_build(container, tbl,
963                                         param.iova >> tbl->it_page_shift,
964                                         param.vaddr,
965                                         param.size >> tbl->it_page_shift,
966                                         direction);
967
968                 iommu_flush_tce(tbl);
969
970                 return ret;
971         }
972         case VFIO_IOMMU_UNMAP_DMA: {
973                 struct vfio_iommu_type1_dma_unmap param;
974                 struct iommu_table *tbl = NULL;
975                 long num;
976
977                 if (!container->enabled)
978                         return -EPERM;
979
980                 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
981                                 size);
982
983                 if (copy_from_user(&param, (void __user *)arg, minsz))
984                         return -EFAULT;
985
986                 if (param.argsz < minsz)
987                         return -EINVAL;
988
989                 /* No flags are supported for now */
990                 if (param.flags)
991                         return -EINVAL;
992
993                 ret = tce_iommu_create_default_window(container);
994                 if (ret)
995                         return ret;
996
997                 num = tce_iommu_find_table(container, param.iova, &tbl);
998                 if (num < 0)
999                         return -ENXIO;
1000
1001                 if (param.size & ~IOMMU_PAGE_MASK(tbl))
1002                         return -EINVAL;
1003
1004                 ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
1005                                 param.size >> tbl->it_page_shift);
1006                 if (ret)
1007                         return ret;
1008
1009                 ret = tce_iommu_clear(container, tbl,
1010                                 param.iova >> tbl->it_page_shift,
1011                                 param.size >> tbl->it_page_shift);
1012                 iommu_flush_tce(tbl);
1013
1014                 return ret;
1015         }
1016         case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
1017                 struct vfio_iommu_spapr_register_memory param;
1018
1019                 if (!container->v2)
1020                         break;
1021
1022                 minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1023                                 size);
1024
1025                 ret = tce_iommu_mm_set(container);
1026                 if (ret)
1027                         return ret;
1028
1029                 if (copy_from_user(&param, (void __user *)arg, minsz))
1030                         return -EFAULT;
1031
1032                 if (param.argsz < minsz)
1033                         return -EINVAL;
1034
1035                 /* No flags are supported for now */
1036                 if (param.flags)
1037                         return -EINVAL;
1038
1039                 mutex_lock(&container->lock);
1040                 ret = tce_iommu_register_pages(container, param.vaddr,
1041                                 param.size);
1042                 mutex_unlock(&container->lock);
1043
1044                 return ret;
1045         }
1046         case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
1047                 struct vfio_iommu_spapr_register_memory param;
1048
1049                 if (!container->v2)
1050                         break;
1051
1052                 if (!container->mm)
1053                         return -EPERM;
1054
1055                 minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1056                                 size);
1057
1058                 if (copy_from_user(&param, (void __user *)arg, minsz))
1059                         return -EFAULT;
1060
1061                 if (param.argsz < minsz)
1062                         return -EINVAL;
1063
1064                 /* No flags are supported for now */
1065                 if (param.flags)
1066                         return -EINVAL;
1067
1068                 mutex_lock(&container->lock);
1069                 ret = tce_iommu_unregister_pages(container, param.vaddr,
1070                                 param.size);
1071                 mutex_unlock(&container->lock);
1072
1073                 return ret;
1074         }
1075         case VFIO_IOMMU_ENABLE:
1076                 if (container->v2)
1077                         break;
1078
1079                 mutex_lock(&container->lock);
1080                 ret = tce_iommu_enable(container);
1081                 mutex_unlock(&container->lock);
1082                 return ret;
1083
1084
1085         case VFIO_IOMMU_DISABLE:
1086                 if (container->v2)
1087                         break;
1088
1089                 mutex_lock(&container->lock);
1090                 tce_iommu_disable(container);
1091                 mutex_unlock(&container->lock);
1092                 return 0;
1093
1094         case VFIO_EEH_PE_OP: {
1095                 struct tce_iommu_group *tcegrp;
1096
1097                 ret = 0;
1098                 list_for_each_entry(tcegrp, &container->group_list, next) {
1099                         ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg);
1100                         if (ret)
1101                                 return ret;
1102                 }
1103                 return ret;
1104         }
1105
1106         case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1107                 struct vfio_iommu_spapr_tce_create create;
1108
1109                 if (!container->v2)
1110                         break;
1111
1112                 ret = tce_iommu_mm_set(container);
1113                 if (ret)
1114                         return ret;
1115
1116                 if (!tce_groups_attached(container))
1117                         return -ENXIO;
1118
1119                 minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1120                                 start_addr);
1121
1122                 if (copy_from_user(&create, (void __user *)arg, minsz))
1123                         return -EFAULT;
1124
1125                 if (create.argsz < minsz)
1126                         return -EINVAL;
1127
1128                 if (create.flags)
1129                         return -EINVAL;
1130
1131                 mutex_lock(&container->lock);
1132
1133                 ret = tce_iommu_create_default_window(container);
1134                 if (!ret)
1135                         ret = tce_iommu_create_window(container,
1136                                         create.page_shift,
1137                                         create.window_size, create.levels,
1138                                         &create.start_addr);
1139
1140                 mutex_unlock(&container->lock);
1141
1142                 if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1143                         ret = -EFAULT;
1144
1145                 return ret;
1146         }
1147         case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1148                 struct vfio_iommu_spapr_tce_remove remove;
1149
1150                 if (!container->v2)
1151                         break;
1152
1153                 ret = tce_iommu_mm_set(container);
1154                 if (ret)
1155                         return ret;
1156
1157                 if (!tce_groups_attached(container))
1158                         return -ENXIO;
1159
1160                 minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1161                                 start_addr);
1162
1163                 if (copy_from_user(&remove, (void __user *)arg, minsz))
1164                         return -EFAULT;
1165
1166                 if (remove.argsz < minsz)
1167                         return -EINVAL;
1168
1169                 if (remove.flags)
1170                         return -EINVAL;
1171
1172                 if (container->def_window_pending && !remove.start_addr) {
1173                         container->def_window_pending = false;
1174                         return 0;
1175                 }
1176
1177                 mutex_lock(&container->lock);
1178
1179                 ret = tce_iommu_remove_window(container, remove.start_addr);
1180
1181                 mutex_unlock(&container->lock);
1182
1183                 return ret;
1184         }
1185         }
1186
1187         return -ENOTTY;
1188 }
1189
1190 static void tce_iommu_release_ownership(struct tce_container *container,
1191                 struct iommu_table_group *table_group)
1192 {
1193         long i;
1194
1195         if (!table_group->ops->unset_window) {
1196                 WARN_ON_ONCE(1);
1197                 return;
1198         }
1199
1200         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1201                 if (container->tables[i])
1202                         table_group->ops->unset_window(table_group, i);
1203 }
1204
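/*
 * Program every window the container already owns into a newly attached
 * group; on failure, roll back by unsetting all windows on that group.
 */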
1205 static long tce_iommu_take_ownership(struct tce_container *container,
1206                 struct iommu_table_group *table_group)
1207 {
1208         long i, ret = 0;
1209
1210         /* Program all existing windows into the new group */
1211         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1212                 struct iommu_table *tbl = container->tables[i];
1213
1214                 if (!tbl)
1215                         continue;
1216
1217                 ret = table_group->ops->set_window(table_group, i, tbl);
1218                 if (ret)
1219                         goto release_exit;
1220         }
1221
1222         return 0;
1223
1224 release_exit:
1225         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1226                 table_group->ops->unset_window(table_group, i);
1227
1228         return ret;
1229 }
1230
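/*
 * Attach an IOMMU group to the container: reject emulated IOMMUs, make
 * sure the group is compatible with those already attached (same
 * create_table op) and program the existing windows into it.
 */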
1231 static int tce_iommu_attach_group(void *iommu_data,
1232                 struct iommu_group *iommu_group, enum vfio_group_type type)
1233 {
1234         int ret = 0;
1235         struct tce_container *container = iommu_data;
1236         struct iommu_table_group *table_group;
1237         struct tce_iommu_group *tcegrp = NULL;
1238
1239         if (type == VFIO_EMULATED_IOMMU)
1240                 return -EINVAL;
1241
1242         mutex_lock(&container->lock);
1243
1244         /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1245                         iommu_group_id(iommu_group), iommu_group); */
1246         table_group = iommu_group_get_iommudata(iommu_group);
1247         if (!table_group) {
1248                 ret = -ENODEV;
1249                 goto unlock_exit;
1250         }
1251
1252         /* v2 requires full support of dynamic DMA windows */
1253         if (container->v2 && table_group->max_dynamic_windows_supported == 0) {
1254                 ret = -EINVAL;
1255                 goto unlock_exit;
1256         }
1257
1258         /* v1 reuses TCE tables and does not share them among PEs */
1259         if (!container->v2 && tce_groups_attached(container)) {
1260                 ret = -EBUSY;
1261                 goto unlock_exit;
1262         }
1263
1264         /*
1265          * Check if the new group has the same iommu_table_group_ops
1266          * (i.e. is compatible)
1267          */
1268         list_for_each_entry(tcegrp, &container->group_list, next) {
1269                 struct iommu_table_group *table_group_tmp;
1270
1271                 if (tcegrp->grp == iommu_group) {
1272                         pr_warn("tce_vfio: Group %d is already attached\n",
1273                                         iommu_group_id(iommu_group));
1274                         ret = -EBUSY;
1275                         goto unlock_exit;
1276                 }
1277                 table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1278                 if (table_group_tmp->ops->create_table !=
1279                                 table_group->ops->create_table) {
1280                         pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1281                                         iommu_group_id(iommu_group),
1282                                         iommu_group_id(tcegrp->grp));
1283                         ret = -EPERM;
1284                         goto unlock_exit;
1285                 }
1286         }
1287
1288         tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1289         if (!tcegrp) {
1290                 ret = -ENOMEM;
1291                 goto unlock_exit;
1292         }
1293
1294         ret = tce_iommu_take_ownership(container, table_group);
1295         if (!tce_groups_attached(container) && !container->tables[0])
1296                 container->def_window_pending = true;
1297
1298         if (!ret) {
1299                 tcegrp->grp = iommu_group;
1300                 list_add(&tcegrp->next, &container->group_list);
1301         }
1302
1303         if (ret && tcegrp)
1304                 kfree(tcegrp);
1305
1306 unlock_exit:
1307         mutex_unlock(&container->lock);
1308
1309         return ret;
1310 }
1311
1312 static void tce_iommu_detach_group(void *iommu_data,
1313                 struct iommu_group *iommu_group)
1314 {
1315         struct tce_container *container = iommu_data;
1316         struct iommu_table_group *table_group;
1317         bool found = false;
1318         struct tce_iommu_group *tcegrp;
1319
1320         mutex_lock(&container->lock);
1321
1322         list_for_each_entry(tcegrp, &container->group_list, next) {
1323                 if (tcegrp->grp == iommu_group) {
1324                         found = true;
1325                         break;
1326                 }
1327         }
1328
1329         if (!found) {
1330                 pr_warn("tce_vfio: detaching unattached group #%u\n",
1331                                 iommu_group_id(iommu_group));
1332                 goto unlock_exit;
1333         }
1334
1335         list_del(&tcegrp->next);
1336         kfree(tcegrp);
1337
1338         table_group = iommu_group_get_iommudata(iommu_group);
1339         BUG_ON(!table_group);
1340
1341         tce_iommu_release_ownership(container, table_group);
1342
1343 unlock_exit:
1344         mutex_unlock(&container->lock);
1345 }
1346
1347 static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1348         .name           = "iommu-vfio-powerpc",
1349         .owner          = THIS_MODULE,
1350         .open           = tce_iommu_open,
1351         .release        = tce_iommu_release,
1352         .ioctl          = tce_iommu_ioctl,
1353         .attach_group   = tce_iommu_attach_group,
1354         .detach_group   = tce_iommu_detach_group,
1355 };
1356
1357 static int __init tce_iommu_init(void)
1358 {
1359         return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1360 }
1361
1362 static void __exit tce_iommu_cleanup(void)
1363 {
1364         vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1365 }
1366
1367 module_init(tce_iommu_init);
1368 module_exit(tce_iommu_cleanup);
1369
1370 MODULE_VERSION(DRIVER_VERSION);
1371 MODULE_LICENSE("GPL v2");
1372 MODULE_AUTHOR(DRIVER_AUTHOR);
1373 MODULE_DESCRIPTION(DRIVER_DESC);
1374