drivers/gpu/drm/i915/gvt/kvmgt.c
/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */

#include <linux/init.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include "i915_drv.h"
#include "gvt.h"

static const struct intel_gvt_ops *intel_gvt_ops;

/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

#define OPREGION_SIGNATURE "IntelGraphicsMem"

struct vfio_region;
struct intel_vgpu_regops {
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

struct vfio_region {
	u32                             type;
	u32                             subtype;
	size_t                          size;
	u32                             flags;
	const struct intel_vgpu_regops  *ops;
	void                            *data;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};

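/*
 * A guest page that has been pinned and DMA-mapped for this vGPU. Each
 * entry is indexed twice: by guest frame number (gfn_node) and by the
 * DMA address returned by dma_map_page() (dma_addr_node).
 */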
struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	struct kref ref;
};

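/*
 * A valid vgpu->handle is a pointer to the kvmgt_guest_info set up in
 * kvmgt_guest_init(); values that fit in the low byte mean no guest is
 * attached.
 */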
static inline bool handle_valid(unsigned long handle)
{
	return !!(handle & ~0xff);
}

static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	struct page *page;
	unsigned long pfn;
	int ret;

	/* Pin the page first. */
	ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1,
			     IOMMU_READ | IOMMU_WRITE, &pfn);
	if (ret != 1) {
		gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx: %d\n",
			     gfn, ret);
		return -EINVAL;
	}

	if (!pfn_valid(pfn)) {
		gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
		vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
		return -EINVAL;
	}

	/* Setup DMA mapping. */
	page = pfn_to_page(pfn);
	*dma_addr = dma_map_page(dev, page, 0, PAGE_SIZE,
				 PCI_DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for gfn 0x%lx\n", gfn);
		vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
		return -ENOMEM;
	}

	return 0;
}

static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	int ret;

	dma_unmap_page(dev, dma_addr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
	ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
	WARN_ON(ret != 1);
}

static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			node = node->rb_left;
		else if (dma_addr > itr->dma_addr)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

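/*
 * Insert a new entry into both lookup trees and bump the debugfs-visible
 * entry count. The trees are protected by vdev.cache_lock.
 */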
static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
		dma_addr_t dma_addr)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link, *parent = NULL;

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->vgpu = vgpu;
	new->gfn = gfn;
	new->dma_addr = dma_addr;
	kref_init(&new->ref);

	/* gfn_cache maps gfn to struct gvt_dma. */
	link = &vgpu->vdev.gfn_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->gfn_node, parent, link);
	rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);

	/* dma_addr_cache maps dma addr to struct gvt_dma. */
	parent = NULL;
	link = &vgpu->vdev.dma_addr_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->dma_addr_node, parent, link);
	rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);

	vgpu->vdev.nr_cache_entries++;
	return 0;
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
				struct gvt_dma *entry)
{
	rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
	rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
	kfree(entry);
	vgpu->vdev.nr_cache_entries--;
}

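/*
 * Drop every cached mapping, DMA-unmapping and unpinning each page;
 * cache_lock is re-taken for every entry.
 */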
static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;

	for (;;) {
		mutex_lock(&vgpu->vdev.cache_lock);
		node = rb_first(&vgpu->vdev.gfn_cache);
		if (!node) {
			mutex_unlock(&vgpu->vdev.cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vgpu->vdev.cache_lock);
	}
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	vgpu->vdev.gfn_cache = RB_ROOT;
	vgpu->vdev.dma_addr_cache = RB_ROOT;
	vgpu->vdev.nr_cache_entries = 0;
	mutex_init(&vgpu->vdev.cache_lock);
}

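/*
 * The protect table records which guest frames are currently
 * write-protected via KVM page tracking, so duplicate add/remove
 * requests can be filtered out cheaply.
 */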
static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}

	return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

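/*
 * Read-only access handler for the device-specific VFIO region that
 * exposes the vGPU's OpRegion; write attempts are rejected.
 */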
static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vgpu->vdev.region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vgpu->vdev.region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}

static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};

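/*
 * Append a device-specific VFIO region (beyond the standard PCI regions)
 * to the vGPU; it becomes visible at index VFIO_PCI_NUM_REGIONS + n.
 */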
static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
		unsigned int type, unsigned int subtype,
		const struct intel_vgpu_regops *ops,
		size_t size, u32 flags, void *data)
{
	struct vfio_region *region;

	region = krealloc(vgpu->vdev.region,
			(vgpu->vdev.num_regions + 1) * sizeof(*region),
			GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vgpu->vdev.region = region;
	vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
	vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
	vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
	vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
	vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
	vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
	vgpu->vdev.num_regions++;
	return 0;
}

static int kvmgt_get_vfio_device(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	vgpu->vdev.vfio_device = vfio_device_get_from_dev(
		mdev_dev(vgpu->vdev.mdev));
	if (!vgpu->vdev.vfio_device) {
		gvt_vgpu_err("failed to get vfio device\n");
		return -ENODEV;
	}
	return 0;
}


static int kvmgt_set_opregion(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	void *base;
	int ret;

	/*
	 * Each vgpu has its own opregion, although VFIO creates another one
	 * later. This one is used to expose the opregion to VFIO; the one
	 * VFIO creates later is what the guest actually uses.
	 */
	base = vgpu_opregion(vgpu)->va;
	if (!base)
		return -ENOMEM;

	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
		memunmap(base);
		return -EINVAL;
	}

	ret = intel_vgpu_register_reg(vgpu,
			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
			&intel_vgpu_regops_opregion, OPREGION_SIZE,
			VFIO_REGION_INFO_FLAG_READ, base);

	return ret;
}

static void kvmgt_put_vfio_device(void *vgpu)
{
	if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
		return;

	vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
}

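/*
 * mdev "create" callback: look up the requested vGPU type by the kobject
 * name and instantiate a vGPU of that type.
 */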
static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = NULL;
	struct intel_vgpu_type *type;
	struct device *pdev;
	void *gvt;
	int ret;

	pdev = mdev_parent_dev(mdev);
	gvt = kdev_to_i915(pdev)->gvt;

	type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
	if (!type) {
		gvt_vgpu_err("failed to find type %s to create\n",
						kobject_name(kobj));
		ret = -EINVAL;
		goto out;
	}

	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
	if (IS_ERR_OR_NULL(vgpu)) {
		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
		gvt_err("failed to create intel vgpu: %d\n", ret);
		goto out;
	}

	INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);

	vgpu->vdev.mdev = mdev;
	mdev_set_drvdata(mdev, vgpu);

	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
		     dev_name(mdev_dev(mdev)));
	ret = 0;

out:
	return ret;
}

static int intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	if (handle_valid(vgpu->handle))
		return -EBUSY;

	intel_gvt_ops->vgpu_destroy(vgpu);
	return 0;
}

static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.iommu_notifier);

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;
		struct gvt_dma *entry;
		unsigned long iov_pfn, end_iov_pfn;

		iov_pfn = unmap->iova >> PAGE_SHIFT;
		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

		mutex_lock(&vgpu->vdev.cache_lock);
		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
			if (!entry)
				continue;

			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr);
			__gvt_cache_remove_entry(vgpu, entry);
		}
		mutex_unlock(&vgpu->vdev.cache_lock);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.group_notifier);

	/* the only action we care about */
	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
		vgpu->vdev.kvm = data;

		if (!data)
			schedule_work(&vgpu->vdev.release_work);
	}

	return NOTIFY_OK;
}

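/*
 * mdev "open" callback: register the IOMMU and group notifiers, set up the
 * KVM-side guest state and activate the vGPU.
 */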
static int intel_vgpu_open(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long events;
	int ret;

	vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
	vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;

	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
				&vgpu->vdev.iommu_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
			ret);
		goto out;
	}

	events = VFIO_GROUP_NOTIFY_SET_KVM;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
				&vgpu->vdev.group_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
			ret);
		goto undo_iommu;
	}

	ret = kvmgt_guest_init(mdev);
	if (ret)
		goto undo_group;

	intel_gvt_ops->vgpu_activate(vgpu);

	atomic_set(&vgpu->vdev.released, 0);
	return ret;

undo_group:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);

undo_iommu:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
out:
	return ret;
}

static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
{
	struct eventfd_ctx *trigger;

	trigger = vgpu->vdev.msi_trigger;
	if (trigger) {
		eventfd_ctx_put(trigger);
		vgpu->vdev.msi_trigger = NULL;
	}
}

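/*
 * Common release path, shared by the mdev "release" callback and the
 * deferred release work scheduled when the KVM association goes away.
 */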
static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
	struct kvmgt_guest_info *info;
	int ret;

	if (!handle_valid(vgpu->handle))
		return;

	if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
		return;

	intel_gvt_ops->vgpu_deactivate(vgpu);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
	WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);
	WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);

	info = (struct kvmgt_guest_info *)vgpu->handle;
	kvmgt_guest_exit(info);

	intel_vgpu_release_msi_eventfd_ctx(vgpu);

	vgpu->vdev.kvm = NULL;
	vgpu->handle = 0;
}

static void intel_vgpu_release(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	__intel_vgpu_release(vgpu);
}

static void intel_vgpu_release_work(struct work_struct *work)
{
	struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
					vdev.release_work);

	__intel_vgpu_release(vgpu);
}

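/*
 * Read a BAR base address from the vGPU's virtual config space, combining
 * the high dword for 64-bit memory BARs.
 */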
static uint64_t intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
	u32 start_lo, start_hi;
	u32 mem_type;

	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_MASK;
	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_TYPE_MASK;

	switch (mem_type) {
	case PCI_BASE_ADDRESS_MEM_TYPE_64:
		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
						+ bar + 4));
		break;
	case PCI_BASE_ADDRESS_MEM_TYPE_32:
	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		/* 1M mem BAR treated as 32-bit BAR */
	default:
		/* mem unknown type treated as 32-bit BAR */
		start_hi = 0;
		break;
	}

	return ((u64)start_hi << 32) | start_lo;
}

static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, uint64_t off,
			     void *buf, unsigned int count, bool is_write)
{
	uint64_t bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
	int ret;

	if (is_write)
		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
					bar_start + off, buf, count);
	else
		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
					bar_start + off, buf, count);
	return ret;
}

static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, uint64_t off)
{
	return off >= vgpu_aperture_offset(vgpu) &&
	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}

static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, uint64_t off,
		void *buf, unsigned long count, bool is_write)
{
	void *aperture_va;

	if (!intel_vgpu_in_aperture(vgpu, off) ||
	    !intel_vgpu_in_aperture(vgpu, off + count)) {
		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
		return -EINVAL;
	}

	aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
					ALIGN_DOWN(off, PAGE_SIZE),
					count + offset_in_page(off));
	if (!aperture_va)
		return -EIO;

	if (is_write)
		memcpy(aperture_va + offset_in_page(off), buf, count);
	else
		memcpy(buf, aperture_va + offset_in_page(off), count);

	io_mapping_unmap(aperture_va);

	return 0;
}

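/*
 * Central read/write dispatcher: the VFIO offset encodes the region index,
 * which selects config space emulation, BAR0 MMIO emulation, direct
 * aperture access (BAR2) or a device-specific region.
 */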
static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;


	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		break;
	default:
		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}

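/*
 * Check whether an access targets the GGTT range inside BAR0; only such
 * accesses are handled as whole 8-byte reads/writes by intel_vgpu_read()
 * and intel_vgpu_write().
 */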
static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct intel_gvt *gvt = vgpu->gvt;
	int offset;

	/* Only allow MMIO GGTT entry access */
	if (index != PCI_BASE_ADDRESS_0)
		return false;

	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);

	return (offset >= gvt->device_info.gtt_start_offset &&
		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
			true : false;
}

static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
			size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only 8-byte GGTT entry reads are supported */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}

static ssize_t intel_vgpu_write(struct mdev_device *mdev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only 8-byte GGTT entry writes are supported */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}

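/*
 * Only BAR2 (the graphics aperture) may be mmapped; the VMA is remapped
 * directly onto the vGPU's slice of the host aperture.
 */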
static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
{
	unsigned int index;
	u64 virtaddr;
	unsigned long req_size, pgoff = 0;
	pgprot_t pg_prot;
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index != VFIO_PCI_BAR2_REGION_INDEX)
		return -EINVAL;

	pg_prot = vma->vm_page_prot;
	virtaddr = vma->vm_start;
	req_size = vma->vm_end - vma->vm_start;
	pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;

	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
}

static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
		return 1;

	return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags,
			void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	struct eventfd_ctx *trigger;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int fd = *(int *)data;

		trigger = eventfd_ctx_fdget(fd);
		if (IS_ERR(trigger)) {
			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
			return PTR_ERR(trigger);
		}
		vgpu->vdev.msi_trigger = trigger;
	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
		intel_vgpu_release_msi_eventfd_ctx(vgpu);

	return 0;
}

static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, uint32_t flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}

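/*
 * mdev "ioctl" callback implementing the VFIO device ioctls: device, region
 * and IRQ info queries, IRQ setup, device reset and the GVT-g dma-buf plane
 * queries.
 */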
static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vgpu->vdev.num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		int i, ret;
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		size_t size;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			size = sizeof(*sparse) +
					(nr_areas * sizeof(*sparse->areas));
			sparse = kzalloc(size, GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vgpu->vdev.num_regions)
					return -EINVAL;

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vgpu->vdev.region[i].size;
				info.flags = vgpu->vdev.region[i].flags;

				cap_type.type = vgpu->vdev.region[i].type;
				cap_type.subtype = vgpu->vdev.region[i].subtype;

				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header, sizeof(*sparse) +
					(sparse->nr_areas *
						sizeof(*sparse->areas)));
				kfree(sparse);
				if (ret)
					return ret;
				break;
			default:
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_ops->vgpu_reset(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;
		__s32 dmabuf_fd;

		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;

		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
		return dmabuf_fd;

	}

	return -ENOTTY;
}

static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%d\n", vgpu->id);
	}
	return sprintf(buf, "\n");
}

static ssize_t
hw_id_show(struct device *dev, struct device_attribute *attr,
	   char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%u\n",
			       vgpu->submission.shadow_ctx->hw_id);
	}
	return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(vgpu_id);
static DEVICE_ATTR_RO(hw_id);

static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	&dev_attr_hw_id.attr,
	NULL
};

static const struct attribute_group intel_vgpu_group = {
	.name = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};

static struct mdev_parent_ops intel_vgpu_ops = {
	.mdev_attr_groups       = intel_vgpu_groups,
	.create                 = intel_vgpu_create,
	.remove                 = intel_vgpu_remove,

	.open                   = intel_vgpu_open,
	.release                = intel_vgpu_release,

	.read                   = intel_vgpu_read,
	.write                  = intel_vgpu_write,
	.mmap                   = intel_vgpu_mmap,
	.ioctl                  = intel_vgpu_ioctl,
};

static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	struct attribute **kvm_type_attrs;
	struct attribute_group **kvm_vgpu_type_groups;

	intel_gvt_ops = ops;
	if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
			&kvm_vgpu_type_groups))
		return -EFAULT;
	intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;

	return mdev_register_device(dev, &intel_vgpu_ops);
}

static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	mdev_unregister_device(dev);
}

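/*
 * Ask KVM's page-tracking framework to start (or stop) write-protecting a
 * guest frame; writes to tracked frames are reported back through
 * kvmgt_page_track_write() and forwarded to the GVT-g write-protect handler.
 */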
1408 static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1409 {
1410         struct kvmgt_guest_info *info;
1411         struct kvm *kvm;
1412         struct kvm_memory_slot *slot;
1413         int idx;
1414
1415         if (!handle_valid(handle))
1416                 return -ESRCH;
1417
1418         info = (struct kvmgt_guest_info *)handle;
1419         kvm = info->kvm;
1420
1421         idx = srcu_read_lock(&kvm->srcu);
1422         slot = gfn_to_memslot(kvm, gfn);
1423         if (!slot) {
1424                 srcu_read_unlock(&kvm->srcu, idx);
1425                 return -EINVAL;
1426         }
1427
1428         spin_lock(&kvm->mmu_lock);
1429
1430         if (kvmgt_gfn_is_write_protected(info, gfn))
1431                 goto out;
1432
1433         kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1434         kvmgt_protect_table_add(info, gfn);
1435
1436 out:
1437         spin_unlock(&kvm->mmu_lock);
1438         srcu_read_unlock(&kvm->srcu, idx);
1439         return 0;
1440 }
1441
1442 static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1443 {
1444         struct kvmgt_guest_info *info;
1445         struct kvm *kvm;
1446         struct kvm_memory_slot *slot;
1447         int idx;
1448
1449         if (!handle_valid(handle))
1450                 return 0;
1451
1452         info = (struct kvmgt_guest_info *)handle;
1453         kvm = info->kvm;
1454
1455         idx = srcu_read_lock(&kvm->srcu);
1456         slot = gfn_to_memslot(kvm, gfn);
1457         if (!slot) {
1458                 srcu_read_unlock(&kvm->srcu, idx);
1459                 return -EINVAL;
1460         }
1461
1462         spin_lock(&kvm->mmu_lock);
1463
1464         if (!kvmgt_gfn_is_write_protected(info, gfn))
1465                 goto out;
1466
1467         kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1468         kvmgt_protect_table_del(info, gfn);
1469
1470 out:
1471         spin_unlock(&kvm->mmu_lock);
1472         srcu_read_unlock(&kvm->srcu, idx);
1473         return 0;
1474 }
1475
1476 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1477                 const u8 *val, int len,
1478                 struct kvm_page_track_notifier_node *node)
1479 {
1480         struct kvmgt_guest_info *info = container_of(node,
1481                                         struct kvmgt_guest_info, track_node);
1482
1483         if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1484                 intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1485                                                      (void *)val, len);
1486 }
1487
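/*
 * KVM page-track notifier callback: a memory slot is being flushed. Drop
 * write tracking and protect-table entries for every protected gfn that
 * falls inside the slot.
 */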
1488 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1489                 struct kvm_memory_slot *slot,
1490                 struct kvm_page_track_notifier_node *node)
1491 {
1492         int i;
1493         gfn_t gfn;
1494         struct kvmgt_guest_info *info = container_of(node,
1495                                         struct kvmgt_guest_info, track_node);
1496
1497         spin_lock(&kvm->mmu_lock);
1498         for (i = 0; i < slot->npages; i++) {
1499                 gfn = slot->base_gfn + i;
1500                 if (kvmgt_gfn_is_write_protected(info, gfn)) {
1501                         kvm_slot_page_track_remove_page(kvm, slot, gfn,
1502                                                 KVM_PAGE_TRACK_WRITE);
1503                         kvmgt_protect_table_del(info, gfn);
1504                 }
1505         }
1506         spin_unlock(&kvm->mmu_lock);
1507 }
1508
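/*
 * Return true if any active vGPU of this GVT instance is already bound
 * to @kvm.
 */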
1509 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1510 {
1511         struct intel_vgpu *itr;
1512         struct kvmgt_guest_info *info;
1513         int id;
1514         bool ret = false;
1515
1516         mutex_lock(&vgpu->gvt->lock);
1517         for_each_active_vgpu(vgpu->gvt, itr, id) {
1518                 if (!handle_valid(itr->handle))
1519                         continue;
1520
1521                 info = (struct kvmgt_guest_info *)itr->handle;
1522                 if (kvm && kvm == info->kvm) {
1523                         ret = true;
1524                         goto out;
1525                 }
1526         }
1527 out:
1528         mutex_unlock(&vgpu->gvt->lock);
1529         return ret;
1530 }
1531
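/*
 * Bind the vGPU behind @mdev to the caller's KVM instance: take a reference
 * on the kvm, initialize the protect table and DMA cache, register the
 * page-track notifier and expose the cache-entry counter in debugfs.
 */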
1532 static int kvmgt_guest_init(struct mdev_device *mdev)
1533 {
1534         struct kvmgt_guest_info *info;
1535         struct intel_vgpu *vgpu;
1536         struct kvm *kvm;
1537
1538         vgpu = mdev_get_drvdata(mdev);
1539         if (handle_valid(vgpu->handle))
1540                 return -EEXIST;
1541
1542         kvm = vgpu->vdev.kvm;
1543         if (!kvm || kvm->mm != current->mm) {
1544                 gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1545                 return -ESRCH;
1546         }
1547
1548         if (__kvmgt_vgpu_exist(vgpu, kvm))
1549                 return -EEXIST;
1550
1551         info = vzalloc(sizeof(struct kvmgt_guest_info));
1552         if (!info)
1553                 return -ENOMEM;
1554
1555         vgpu->handle = (unsigned long)info;
1556         info->vgpu = vgpu;
1557         info->kvm = kvm;
1558         kvm_get_kvm(info->kvm);
1559
1560         kvmgt_protect_table_init(info);
1561         gvt_cache_init(vgpu);
1562
1563         mutex_init(&vgpu->dmabuf_lock);
1564         init_completion(&vgpu->vblank_done);
1565
1566         info->track_node.track_write = kvmgt_page_track_write;
1567         info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1568         kvm_page_track_register_notifier(kvm, &info->track_node);
1569
1570         info->debugfs_cache_entries = debugfs_create_ulong(
1571                                                 "kvmgt_nr_cache_entries",
1572                                                 0444, vgpu->debugfs,
1573                                                 &vgpu->vdev.nr_cache_entries);
1574         if (!info->debugfs_cache_entries)
1575                 gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");
1576
1577         return 0;
1578 }
1579
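/*
 * Tear down the KVM binding created by kvmgt_guest_init(): remove the
 * debugfs entry, unregister the page-track notifier, drop the kvm
 * reference and free the protect table, the DMA cache and @info itself.
 */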
1580 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1581 {
1582         debugfs_remove(info->debugfs_cache_entries);
1583
1584         kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1585         kvm_put_kvm(info->kvm);
1586         kvmgt_protect_table_destroy(info);
1587         gvt_cache_destroy(info->vgpu);
1588         vfree(info);
1589
1590         return true;
1591 }
1592
1593 static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
1594 {
1595         /* nothing to do here */
1596         return 0;
1597 }
1598
1599 static void kvmgt_detach_vgpu(unsigned long handle)
1600 {
1601         /* nothing to do here */
1602 }
1603
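/*
 * Deliver an MSI to the guest by signalling the eventfd that userspace
 * registered as the vGPU's MSI trigger.
 */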
1604 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
1605 {
1606         struct kvmgt_guest_info *info;
1607         struct intel_vgpu *vgpu;
1608
1609         if (!handle_valid(handle))
1610                 return -ESRCH;
1611
1612         info = (struct kvmgt_guest_info *)handle;
1613         vgpu = info->vgpu;
1614
1615         /*
1616          * When the guest powers off, msi_trigger is set to NULL, but the
1617          * vgpu's config space and MMIO registers are not restored to their
1618          * defaults. If this vgpu is reused by the next VM, its pipes may
1619          * still be enabled, so it will receive vblank interrupt requests as
1620          * soon as it becomes active. However, msi_trigger stays NULL until
1621          * the guest enables MSI, so in that case return success without
1622          * injecting an interrupt into the guest.
1623          */
1624         if (vgpu->vdev.msi_trigger == NULL)
1625                 return 0;
1626
1627         if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
1628                 return 0;
1629
1630         return -EFAULT;
1631 }
1632
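/*
 * Translate a guest frame number to a host pfn through KVM; returns
 * INTEL_GVT_INVALID_ADDR if the gfn cannot be resolved to a valid pfn.
 */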
1633 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
1634 {
1635         struct kvmgt_guest_info *info;
1636         kvm_pfn_t pfn;
1637
1638         if (!handle_valid(handle))
1639                 return INTEL_GVT_INVALID_ADDR;
1640
1641         info = (struct kvmgt_guest_info *)handle;
1642
1643         pfn = gfn_to_pfn(info->kvm, gfn);
1644         if (is_error_noslot_pfn(pfn))
1645                 return INTEL_GVT_INVALID_ADDR;
1646
1647         return pfn;
1648 }
1649
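/*
 * Map a guest page for device DMA. On a cache miss the page is mapped via
 * gvt_dma_map_page() and a new cache entry is added; on a hit the entry's
 * refcount is bumped and the cached DMA address is returned.
 */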
1650 int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
1651                 dma_addr_t *dma_addr)
1652 {
1653         struct kvmgt_guest_info *info;
1654         struct intel_vgpu *vgpu;
1655         struct gvt_dma *entry;
1656         int ret;
1657
1658         if (!handle_valid(handle))
1659                 return -EINVAL;
1660
1661         info = (struct kvmgt_guest_info *)handle;
1662         vgpu = info->vgpu;
1663
1664         mutex_lock(&info->vgpu->vdev.cache_lock);
1665
1666         entry = __gvt_cache_find_gfn(info->vgpu, gfn);
1667         if (!entry) {
1668                 ret = gvt_dma_map_page(vgpu, gfn, dma_addr);
1669                 if (ret)
1670                         goto err_unlock;
1671
1672                 ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr);
1673                 if (ret)
1674                         goto err_unmap;
1675         } else {
1676                 kref_get(&entry->ref);
1677                 *dma_addr = entry->dma_addr;
1678         }
1679
1680         mutex_unlock(&info->vgpu->vdev.cache_lock);
1681         return 0;
1682
1683 err_unmap:
1684         gvt_dma_unmap_page(vgpu, gfn, *dma_addr);
1685 err_unlock:
1686         mutex_unlock(&info->vgpu->vdev.cache_lock);
1687         return ret;
1688 }
1689
1690 static void __gvt_dma_release(struct kref *ref)
1691 {
1692         struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1693
1694         gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr);
1695         __gvt_cache_remove_entry(entry->vgpu, entry);
1696 }
1697
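/*
 * Drop one reference on the cache entry for @dma_addr; the page is unmapped
 * and the entry removed by __gvt_dma_release() when the last reference goes.
 */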
1698 void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
1699 {
1700         struct kvmgt_guest_info *info;
1701         struct gvt_dma *entry;
1702
1703         if (!handle_valid(handle))
1704                 return;
1705
1706         info = (struct kvmgt_guest_info *)handle;
1707
1708         mutex_lock(&info->vgpu->vdev.cache_lock);
1709         entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
1710         if (entry)
1711                 kref_put(&entry->ref, __gvt_dma_release);
1712         mutex_unlock(&info->vgpu->vdev.cache_lock);
1713 }
1714
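/*
 * Read or write guest physical memory through KVM. When called from a
 * kernel thread (current->mm == NULL), temporarily adopt the guest's mm so
 * the access can reach the guest's user address space.
 */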
1715 static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
1716                         void *buf, unsigned long len, bool write)
1717 {
1718         struct kvmgt_guest_info *info;
1719         struct kvm *kvm;
1720         int idx, ret;
1721         bool kthread = current->mm == NULL;
1722
1723         if (!handle_valid(handle))
1724                 return -ESRCH;
1725
1726         info = (struct kvmgt_guest_info *)handle;
1727         kvm = info->kvm;
1728
1729         if (kthread)
1730                 use_mm(kvm->mm);
1731
1732         idx = srcu_read_lock(&kvm->srcu);
1733         ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
1734                       kvm_read_guest(kvm, gpa, buf, len);
1735         srcu_read_unlock(&kvm->srcu, idx);
1736
1737         if (kthread)
1738                 unuse_mm(kvm->mm);
1739
1740         return ret;
1741 }
1742
1743 static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
1744                         void *buf, unsigned long len)
1745 {
1746         return kvmgt_rw_gpa(handle, gpa, buf, len, false);
1747 }
1748
1749 static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
1750                         void *buf, unsigned long len)
1751 {
1752         return kvmgt_rw_gpa(handle, gpa, buf, len, true);
1753 }
1754
1755 static unsigned long kvmgt_virt_to_pfn(void *addr)
1756 {
1757         return PFN_DOWN(__pa(addr));
1758 }
1759
1760 static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
1761 {
1762         struct kvmgt_guest_info *info;
1763         struct kvm *kvm;
1764
1765         if (!handle_valid(handle))
1766                 return false;
1767
1768         info = (struct kvmgt_guest_info *)handle;
1769         kvm = info->kvm;
1770
1771         return kvm_is_visible_gfn(kvm, gfn);
1772
1773 }
1774
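/*
 * Mediated pass-through operations backed by KVM/VFIO. The GVT core calls
 * these hooks for host (un)registration, page tracking, MSI injection,
 * guest memory access and guest page DMA mapping.
 */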
1775 struct intel_gvt_mpt kvmgt_mpt = {
1776         .host_init = kvmgt_host_init,
1777         .host_exit = kvmgt_host_exit,
1778         .attach_vgpu = kvmgt_attach_vgpu,
1779         .detach_vgpu = kvmgt_detach_vgpu,
1780         .inject_msi = kvmgt_inject_msi,
1781         .from_virt_to_mfn = kvmgt_virt_to_pfn,
1782         .enable_page_track = kvmgt_page_track_add,
1783         .disable_page_track = kvmgt_page_track_remove,
1784         .read_gpa = kvmgt_read_gpa,
1785         .write_gpa = kvmgt_write_gpa,
1786         .gfn_to_mfn = kvmgt_gfn_to_pfn,
1787         .dma_map_guest_page = kvmgt_dma_map_guest_page,
1788         .dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
1789         .set_opregion = kvmgt_set_opregion,
1790         .get_vfio_device = kvmgt_get_vfio_device,
1791         .put_vfio_device = kvmgt_put_vfio_device,
1792         .is_valid_gfn = kvmgt_is_valid_gfn,
1793 };
1794 EXPORT_SYMBOL_GPL(kvmgt_mpt);
1795
1796 static int __init kvmgt_init(void)
1797 {
1798         return 0;
1799 }
1800
1801 static void __exit kvmgt_exit(void)
1802 {
1803 }
1804
1805 module_init(kvmgt_init);
1806 module_exit(kvmgt_exit);
1807
1808 MODULE_LICENSE("GPL and additional rights");
1809 MODULE_AUTHOR("Intel Corporation");