/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 *    Eddie Dong <eddie.dong@intel.com>
 *
 * Contributors:
 *    Niu Bing <bing.niu@intel.com>
 *    Zhi Wang <zhi.a.wang@intel.com>
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/sched/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include <linux/nospec.h>

#include <drm/drm_edid.h>

#include "i915_drv.h"
#include "intel_gvt.h"
#include "gvt.h"

MODULE_IMPORT_NS(DMA_BUF);
MODULE_IMPORT_NS(I915_GVT);
/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
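
/*
 * VFIO device file offsets encode the region index in the top bits:
 * bits [63:40] select the region and bits [39:0] are the offset within
 * it. For example, region index 2 (BAR2) starts at file offset
 * 2ULL << 40 = 0x20000000000, and an access at BAR2 offset 0x1000 uses
 * *ppos = (2ULL << VFIO_PCI_OFFSET_SHIFT) | 0x1000.
 */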
#define EDID_BLOB_OFFSET (PAGE_SIZE/2)

#define OPREGION_SIGNATURE "IntelGraphicsMem"

struct intel_vgpu_regops {
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

struct vfio_region {
	u32				type;
	u32				subtype;
	size_t				size;
	u32				flags;
	const struct intel_vgpu_regops	*ops;
	void				*data;
};

struct vfio_edid_region {
	struct vfio_region_gfx_edid vfio_edid_regs;
	void *edid_blob;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	unsigned long size;
	struct kref ref;
};
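
/*
 * Each gvt_dma entry is indexed twice: by guest frame number in
 * vgpu->gfn_cache and by DMA address in vgpu->dma_addr_cache, so a
 * mapping can be looked up from either direction. Entries are
 * reference counted through 'ref' and freed when the last user drops
 * its reference.
 */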
#define vfio_dev_to_vgpu(vfio_dev) \
	container_of((vfio_dev), struct intel_vgpu, vfio_device)

static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
		struct kvm_page_track_notifier_node *node);
static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node);

static ssize_t intel_vgpu_show_description(struct mdev_type *mtype, char *buf)
{
	struct intel_vgpu_type *type =
		container_of(mtype, struct intel_vgpu_type, type);

	return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
		       "fence: %d\nresolution: %s\n"
		       "weight: %d\n",
		       BYTES_TO_MB(type->conf->low_mm),
		       BYTES_TO_MB(type->conf->high_mm),
		       type->conf->fence, vgpu_edid_str(type->conf->edid),
		       type->conf->weight);
}

static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size)
{
	vfio_unpin_pages(&vgpu->vfio_device, gfn << PAGE_SHIFT,
			 DIV_ROUND_UP(size, PAGE_SIZE));
}
/* Pin a normal or compound guest page for dma. */
static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size, struct page **page)
{
	int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
	struct page *base_page = NULL;
	int npage;
	int ret;

	/*
	 * We pin the pages one-by-one to avoid allocating a big array
	 * on stack to hold pfns.
	 */
	for (npage = 0; npage < total_pages; npage++) {
		dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
		struct page *cur_page;

		ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
				     IOMMU_READ | IOMMU_WRITE, &cur_page);
		if (ret != 1) {
			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
				     &cur_iova, ret);
			goto err;
		}

		if (npage == 0)
			base_page = cur_page;
		else if (page_to_pfn(base_page) + npage != page_to_pfn(cur_page)) {
			gvt_vgpu_err("The pages are not continuous\n");
			ret = -EINVAL;
			npage++;
			goto err;
		}
	}

	*page = base_page;
	return 0;
err:
	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
	return ret;
}
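
/*
 * gvt_pin_guest_page() only succeeds when the pinned pfns are
 * physically contiguous, which is what lets gvt_dma_map_page() below
 * hand the base page to dma_map_page() as one mapping of 'size' bytes.
 */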
static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr, unsigned long size)
{
	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
	struct page *page = NULL;
	int ret;

	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
	if (ret)
		return ret;

	/* Setup DMA mapping. */
	*dma_addr = dma_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
			     page_to_pfn(page), ret);
		gvt_unpin_guest_page(vgpu, gfn, size);
		return -ENOMEM;
	}

	return 0;
}

static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct device *dev = vgpu->gvt->gt->i915->drm.dev;

	dma_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL);
	gvt_unpin_guest_page(vgpu, gfn, size);
}
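
/* The rb-tree lookups below must be called with vgpu->cache_lock held. */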
static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct rb_node *node = vgpu->dma_addr_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			node = node->rb_left;
		else if (dma_addr > itr->dma_addr)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = vgpu->gfn_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link, *parent = NULL;

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->vgpu = vgpu;
	new->gfn = gfn;
	new->dma_addr = dma_addr;
	new->size = size;
	kref_init(&new->ref);

	/* gfn_cache maps gfn to struct gvt_dma. */
	link = &vgpu->gfn_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->gfn_node, parent, link);
	rb_insert_color(&new->gfn_node, &vgpu->gfn_cache);

	/* dma_addr_cache maps dma addr to struct gvt_dma. */
	parent = NULL;
	link = &vgpu->dma_addr_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->dma_addr_node, parent, link);
	rb_insert_color(&new->dma_addr_node, &vgpu->dma_addr_cache);

	vgpu->nr_cache_entries++;
	return 0;
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
		struct gvt_dma *entry)
{
	rb_erase(&entry->gfn_node, &vgpu->gfn_cache);
	rb_erase(&entry->dma_addr_node, &vgpu->dma_addr_cache);
	kfree(entry);
	vgpu->nr_cache_entries--;
}

static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;

	for (;;) {
		mutex_lock(&vgpu->cache_lock);
		node = rb_first(&vgpu->gfn_cache);
		if (!node) {
			mutex_unlock(&vgpu->cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vgpu->cache_lock);
	}
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	vgpu->gfn_cache = RB_ROOT;
	vgpu->dma_addr_cache = RB_ROOT;
	vgpu->nr_cache_entries = 0;
	mutex_init(&vgpu->cache_lock);
}
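
/*
 * The protect table mirrors the set of gfns that have been handed to
 * KVM for write protection, hashed by gfn, so KVMGT can answer
 * "is this gfn write-protected?" without calling back into KVM.
 */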
static void kvmgt_protect_table_init(struct intel_vgpu *info)
{
	hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct intel_vgpu *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct intel_vgpu *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	lockdep_assert_held(&info->vgpu_lock);

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}
	return res;
}

static bool kvmgt_gfn_is_write_protected(struct intel_vgpu *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct intel_vgpu *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct intel_vgpu *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vgpu->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vgpu->region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	count = min(count, (size_t)(vgpu->region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}

static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};
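
/*
 * The EDID region is split in two: the vfio_region_gfx_edid control
 * registers live at offset 0, and the EDID blob itself starts at
 * EDID_BLOB_OFFSET (PAGE_SIZE/2), which is what
 * vfio_edid_regs.edid_offset advertises to userspace.
 */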
static int handle_edid_regs(struct intel_vgpu *vgpu,
			struct vfio_edid_region *region, char *buf,
			size_t count, u16 offset, bool is_write)
{
	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
	unsigned int data;

	if (offset + count > sizeof(*regs))
		return -EINVAL;

	if (count != 4)
		return -EINVAL;

	if (is_write) {
		data = *((unsigned int *)buf);
		switch (offset) {
		case offsetof(struct vfio_region_gfx_edid, link_state):
			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
				if (!drm_edid_block_valid(
					(u8 *)region->edid_blob,
					0,
					true,
					NULL)) {
					gvt_vgpu_err("invalid EDID blob\n");
					return -EINVAL;
				}
				intel_vgpu_emulate_hotplug(vgpu, true);
			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
				intel_vgpu_emulate_hotplug(vgpu, false);
			else {
				gvt_vgpu_err("invalid EDID link state %d\n",
					regs->link_state);
				return -EINVAL;
			}
			regs->link_state = data;
			break;
		case offsetof(struct vfio_region_gfx_edid, edid_size):
			if (data > regs->edid_max_size) {
				gvt_vgpu_err("EDID size is bigger than %d!\n",
					regs->edid_max_size);
				return -EINVAL;
			}
			regs->edid_size = data;
			break;
		default:
			/* read-only regs */
			gvt_vgpu_err("write read-only EDID region at offset %d\n",
				offset);
			return -EPERM;
		}
	} else {
		memcpy(buf, (char *)regs + offset, count);
	}

	return count;
}

static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
			size_t count, u16 offset, bool is_write)
{
	if (offset + count > region->vfio_edid_regs.edid_size)
		return -EINVAL;

	if (is_write)
		memcpy(region->edid_blob + offset, buf, count);
	else
		memcpy(buf, region->edid_blob + offset, count);

	return count;
}

static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	int ret;
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	struct vfio_edid_region *region = vgpu->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos < region->vfio_edid_regs.edid_offset) {
		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
	} else {
		pos -= EDID_BLOB_OFFSET;
		ret = handle_edid_blob(region, buf, count, pos, iswrite);
	}

	if (ret < 0)
		gvt_vgpu_err("failed to access EDID region\n");

	return ret;
}

static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
	kfree(region->data);
}

static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
	.rw = intel_vgpu_reg_rw_edid,
	.release = intel_vgpu_reg_release_edid,
};
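
/*
 * Device-specific regions (OpRegion, EDID) are appended after the
 * standard VFIO PCI regions; userspace discovers them through
 * VFIO_DEVICE_GET_REGION_INFO, which attaches a region-type capability
 * carrying the type/subtype registered here.
 */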
static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
		unsigned int type, unsigned int subtype,
		const struct intel_vgpu_regops *ops,
		size_t size, u32 flags, void *data)
{
	struct vfio_region *region;

	region = krealloc(vgpu->region,
			(vgpu->num_regions + 1) * sizeof(*region),
			GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vgpu->region = region;
	vgpu->region[vgpu->num_regions].type = type;
	vgpu->region[vgpu->num_regions].subtype = subtype;
	vgpu->region[vgpu->num_regions].ops = ops;
	vgpu->region[vgpu->num_regions].size = size;
	vgpu->region[vgpu->num_regions].flags = flags;
	vgpu->region[vgpu->num_regions].data = data;
	vgpu->num_regions++;
	return 0;
}

int intel_gvt_set_opregion(struct intel_vgpu *vgpu)
{
	void *base;
	int ret;

	/* Each vgpu has its own opregion, although VFIO would create another
	 * one later. This one is used to expose the opregion to VFIO, while
	 * the one VFIO creates later is the one the guest actually uses.
	 */
	base = vgpu_opregion(vgpu)->va;
	if (!base)
		return -ENOMEM;

	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
		memset(base, 0, OPREGION_SIZE);
		return -EINVAL;
	}

	ret = intel_vgpu_register_reg(vgpu,
			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
			&intel_vgpu_regops_opregion, OPREGION_SIZE,
			VFIO_REGION_INFO_FLAG_READ, base);

	return ret;
}

int intel_gvt_set_edid(struct intel_vgpu *vgpu, int port_num)
{
	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
	struct vfio_edid_region *base;
	int ret;

	base = kzalloc(sizeof(*base), GFP_KERNEL);
	if (!base)
		return -ENOMEM;

	/* TODO: Add multi-port and EDID extension block support */
	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
	base->vfio_edid_regs.edid_size = EDID_SIZE;
	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
	base->edid_blob = port->edid->edid_block;

	ret = intel_vgpu_register_reg(vgpu,
			VFIO_REGION_TYPE_GFX,
			VFIO_REGION_SUBTYPE_GFX_EDID,
			&intel_vgpu_regops_edid, EDID_SIZE,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_CAPS, base);

	return ret;
}

static void intel_vgpu_dma_unmap(struct vfio_device *vfio_dev, u64 iova,
				 u64 length)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
	struct gvt_dma *entry;
	u64 iov_pfn = iova >> PAGE_SHIFT;
	u64 end_iov_pfn = iov_pfn + length / PAGE_SIZE;

	mutex_lock(&vgpu->cache_lock);
	for (; iov_pfn < end_iov_pfn; iov_pfn++) {
		entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
		if (!entry)
			continue;

		gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
				   entry->size);
		__gvt_cache_remove_entry(vgpu, entry);
	}
	mutex_unlock(&vgpu->cache_lock);
}
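
/*
 * Only one vGPU may be opened per KVM instance: open_device refuses a
 * vGPU whose VM already has another vGPU attached.
 */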
static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu)
{
	struct intel_vgpu *itr;
	int id;
	bool ret = false;

	mutex_lock(&vgpu->gvt->lock);
	for_each_active_vgpu(vgpu->gvt, itr, id) {
		if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, itr->status))
			continue;

		if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) {
			ret = true;
			goto out;
		}
	}
out:
	mutex_unlock(&vgpu->gvt->lock);
	return ret;
}

static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);

	if (!vgpu->vfio_device.kvm ||
	    vgpu->vfio_device.kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	if (__kvmgt_vgpu_exist(vgpu))
		return -EEXIST;

	vgpu->track_node.track_write = kvmgt_page_track_write;
	vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_get_kvm(vgpu->vfio_device.kvm);
	kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
					 &vgpu->track_node);

	set_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status);

	debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs,
			     &vgpu->nr_cache_entries);

	intel_gvt_activate_vgpu(vgpu);

	return 0;
}

static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
{
	struct eventfd_ctx *trigger;

	trigger = vgpu->msi_trigger;
	if (trigger) {
		eventfd_ctx_put(trigger);
		vgpu->msi_trigger = NULL;
	}
}

static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);

	intel_gvt_release_vgpu(vgpu);

	clear_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status);

	debugfs_lookup_and_remove(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs);

	kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
					   &vgpu->track_node);
	kvm_put_kvm(vgpu->vfio_device.kvm);

	kvmgt_protect_table_destroy(vgpu);
	gvt_cache_destroy(vgpu);

	WARN_ON(vgpu->nr_cache_entries);

	vgpu->gfn_cache = RB_ROOT;
	vgpu->dma_addr_cache = RB_ROOT;

	intel_vgpu_release_msi_eventfd_ctx(vgpu);
}
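
/*
 * Decode a BAR base address from the vGPU's virtual PCI config space.
 * The memory-type bits decide the width: a 64-bit BAR consumes two
 * consecutive DWORDs, anything else is treated as a 32-bit BAR.
 */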
static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
	u32 start_lo, start_hi;
	u32 mem_type;

	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_MASK;
	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_TYPE_MASK;

	switch (mem_type) {
	case PCI_BASE_ADDRESS_MEM_TYPE_64:
		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
						+ bar + 4));
		break;
	case PCI_BASE_ADDRESS_MEM_TYPE_32:
	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		/* 1M mem BAR treated as 32-bit BAR */
	default:
		/* mem unknown type treated as 32-bit BAR */
		start_hi = 0;
		break;
	}

	return ((u64)start_hi << 32) | start_lo;
}

static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
			void *buf, unsigned int count, bool is_write)
{
	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
	int ret;

	if (is_write)
		ret = intel_vgpu_emulate_mmio_write(vgpu,
					bar_start + off, buf, count);
	else
		ret = intel_vgpu_emulate_mmio_read(vgpu,
					bar_start + off, buf, count);
	return ret;
}
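
/*
 * BAR2 (the GM aperture) is not emulated register-by-register: reads
 * and writes go straight to the host aperture through a write-combined
 * mapping of the vGPU's slice of GGTT space.
 */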
static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
{
	return off >= vgpu_aperture_offset(vgpu) &&
	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}

static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
		void *buf, unsigned long count, bool is_write)
{
	void __iomem *aperture_va;

	if (!intel_vgpu_in_aperture(vgpu, off) ||
	    !intel_vgpu_in_aperture(vgpu, off + count)) {
		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
		return -EINVAL;
	}

	aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
					ALIGN_DOWN(off, PAGE_SIZE),
					count + offset_in_page(off));
	if (!aperture_va)
		return -EIO;

	if (is_write)
		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
	else
		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);

	io_mapping_unmap(aperture_va);

	return 0;
}

static ssize_t intel_vgpu_rw(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;

	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_vgpu_emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_vgpu_emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		break;
	default:
		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vgpu->region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}
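
/*
 * GGTT entries are 64 bits wide. To keep an entry update atomic from
 * the guest's point of view, accesses that are 8-byte sized, 8-byte
 * aligned and fall inside the GGTT range of BAR0 are emulated as a
 * single 64-bit access instead of two 32-bit halves.
 */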
static bool gtt_entry(struct intel_vgpu *vgpu, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct intel_gvt *gvt = vgpu->gvt;
	int offset;

	/* Only allow MMIO GGTT entry access */
	if (index != PCI_BASE_ADDRESS_0)
		return false;

	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);

	return (offset >= gvt->device_info.gtt_start_offset &&
		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
			true : false;
}

static ssize_t intel_vgpu_read(struct vfio_device *vfio_dev, char __user *buf,
			size_t count, loff_t *ppos)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support 8-byte GGTT entry reads */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(vgpu, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(vgpu, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}

static ssize_t intel_vgpu_write(struct vfio_device *vfio_dev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support 8-byte GGTT entry writes */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(vgpu, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(vgpu, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(vgpu, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(vgpu, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}

static int intel_vgpu_mmap(struct vfio_device *vfio_dev,
		struct vm_area_struct *vma)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
	unsigned int index;
	u64 virtaddr;
	unsigned long req_size, pgoff, req_start;
	pgprot_t pg_prot;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index != VFIO_PCI_BAR2_REGION_INDEX)
		return -EINVAL;

	pg_prot = vma->vm_page_prot;
	virtaddr = vma->vm_start;
	req_size = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (!intel_vgpu_in_aperture(vgpu, req_start))
		return -EINVAL;
	if (req_start + req_size >
	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
		return -EINVAL;

	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
}
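
/*
 * Interrupt plumbing: only INTx and MSI are exposed, one vector each.
 * The INTx mask/unmask/trigger handlers are accepted but are no-ops;
 * MSI delivery is backed by the eventfd that userspace registers
 * through VFIO_DEVICE_SET_IRQS.
 */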
static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
		return 1;

	return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, u32 flags,
			void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, u32 flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		u32 flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		u32 flags, void *data)
{
	struct eventfd_ctx *trigger;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int fd = *(int *)data;

		trigger = eventfd_ctx_fdget(fd);
		if (IS_ERR(trigger)) {
			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
			return PTR_ERR(trigger);
		}
		vgpu->msi_trigger = trigger;
	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
		intel_vgpu_release_msi_eventfd_ctx(vgpu);

	return 0;
}

static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, u32 flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}
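
/*
 * Main VFIO ioctl dispatcher. Mirrors vfio-pci: device info, region and
 * IRQ enumeration, SET_IRQS and device reset, plus the GVT-specific
 * dma-buf based display plane queries.
 */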
static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vgpu->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned int i;
		int ret;
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
					 GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vgpu->num_regions)
					return -EINVAL;
				info.index =
					array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vgpu->num_regions);

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vgpu->region[i].size;
				info.flags = vgpu->region[i].flags;

				cap_type.type = vgpu->region[i].type;
				cap_type.subtype = vgpu->region[i].subtype;

				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header,
					struct_size(sparse, areas,
						    sparse->nr_areas));
				if (ret) {
					kfree(sparse);
					return ret;
				}
				break;
			default:
				kfree(sparse);
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					kfree(sparse);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		kfree(sparse);
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_reset_vgpu(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;

		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;
		return intel_vgpu_get_dmabuf(vgpu, dmabuf_id);
	}

	return -ENOTTY;
}
static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct intel_vgpu *vgpu = dev_get_drvdata(dev);

	return sprintf(buf, "%d\n", vgpu->id);
}

static DEVICE_ATTR_RO(vgpu_id);

static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	NULL
};

static const struct attribute_group intel_vgpu_group = {
	.name = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};

static int intel_vgpu_init_dev(struct vfio_device *vfio_dev)
{
	struct mdev_device *mdev = to_mdev_device(vfio_dev->dev);
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
	struct intel_vgpu_type *type =
		container_of(mdev->type, struct intel_vgpu_type, type);
	int ret;

	vgpu->gvt = kdev_to_i915(mdev->type->parent->dev)->gvt;
	ret = intel_gvt_create_vgpu(vgpu, type->conf);
	if (ret)
		return ret;

	kvmgt_protect_table_init(vgpu);
	gvt_cache_init(vgpu);

	return 0;
}

static void intel_vgpu_release_dev(struct vfio_device *vfio_dev)
{
	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);

	intel_gvt_destroy_vgpu(vgpu);
}

static const struct vfio_device_ops intel_vgpu_dev_ops = {
	.init = intel_vgpu_init_dev,
	.release = intel_vgpu_release_dev,
	.open_device = intel_vgpu_open_device,
	.close_device = intel_vgpu_close_device,
	.read = intel_vgpu_read,
	.write = intel_vgpu_write,
	.mmap = intel_vgpu_mmap,
	.ioctl = intel_vgpu_ioctl,
	.dma_unmap = intel_vgpu_dma_unmap,
	.bind_iommufd = vfio_iommufd_emulated_bind,
	.unbind_iommufd = vfio_iommufd_emulated_unbind,
	.attach_ioas = vfio_iommufd_emulated_attach_ioas,
};
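
/*
 * mdev probe path: allocate the vfio_device (which embeds the
 * intel_vgpu), then register it as an emulated-IOMMU device so VFIO
 * drives init/release through intel_vgpu_dev_ops above.
 */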
static int intel_vgpu_probe(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu;
	int ret;

	vgpu = vfio_alloc_device(intel_vgpu, vfio_device, &mdev->dev,
				 &intel_vgpu_dev_ops);
	if (IS_ERR(vgpu)) {
		gvt_err("failed to create intel vgpu: %ld\n", PTR_ERR(vgpu));
		return PTR_ERR(vgpu);
	}

	dev_set_drvdata(&mdev->dev, vgpu);
	ret = vfio_register_emulated_iommu_dev(&vgpu->vfio_device);
	if (ret)
		goto out_put_vdev;

	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
		     dev_name(mdev_dev(mdev)));
	return 0;

out_put_vdev:
	vfio_put_device(&vgpu->vfio_device);
	return ret;
}

static void intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = dev_get_drvdata(&mdev->dev);

	vfio_unregister_group_dev(&vgpu->vfio_device);
	vfio_put_device(&vgpu->vfio_device);
}

static unsigned int intel_vgpu_get_available(struct mdev_type *mtype)
{
	struct intel_vgpu_type *type =
		container_of(mtype, struct intel_vgpu_type, type);
	struct intel_gvt *gvt = kdev_to_i915(mtype->parent->dev)->gvt;
	unsigned int low_gm_avail, high_gm_avail, fence_avail;

	mutex_lock(&gvt->lock);
	low_gm_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE -
		gvt->gm.vgpu_allocated_low_gm_size;
	high_gm_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE -
		gvt->gm.vgpu_allocated_high_gm_size;
	fence_avail = gvt_fence_sz(gvt) - HOST_FENCE -
		gvt->fence.vgpu_allocated_fence_num;
	mutex_unlock(&gvt->lock);

	return min3(low_gm_avail / type->conf->low_mm,
		    high_gm_avail / type->conf->high_mm,
		    fence_avail / type->conf->fence);
}

static struct mdev_driver intel_vgpu_mdev_driver = {
	.device_api	= VFIO_DEVICE_API_PCI_STRING,
	.driver = {
		.name		= "intel_vgpu_mdev",
		.owner		= THIS_MODULE,
		.dev_groups	= intel_vgpu_groups,
	},
	.probe		= intel_vgpu_probe,
	.remove		= intel_vgpu_remove,
	.get_available	= intel_vgpu_get_available,
	.show_description	= intel_vgpu_show_description,
};
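
/*
 * Shadow page table protection: KVMGT asks KVM to write-protect the
 * guest pages that hold GTT page tables, and records each protected
 * gfn in the local protect table so writes can be routed to the page
 * track handler.
 */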
int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn)
{
	struct kvm *kvm = info->vfio_device.kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status))
		return -ESRCH;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	write_lock(&kvm->mmu_lock);
	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	write_unlock(&kvm->mmu_lock);

	srcu_read_unlock(&kvm->srcu, idx);

	kvmgt_protect_table_add(info, gfn);
	return 0;
}

int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn)
{
	struct kvm *kvm = info->vfio_device.kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status))
		return -ESRCH;

	if (!kvmgt_gfn_is_write_protected(info, gfn))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	write_lock(&kvm->mmu_lock);
	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	kvmgt_protect_table_del(info, gfn);
	return 0;
}

static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct intel_vgpu *info =
		container_of(node, struct intel_vgpu, track_node);

	mutex_lock(&info->vgpu_lock);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_vgpu_page_track_handler(info, gpa,
						     (void *)val, len);

	mutex_unlock(&info->vgpu_lock);
}

static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	unsigned long i;
	gfn_t gfn;
	struct intel_vgpu *info =
		container_of(node, struct intel_vgpu, track_node);

	mutex_lock(&info->vgpu_lock);

	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			write_lock(&kvm->mmu_lock);
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			write_unlock(&kvm->mmu_lock);

			kvmgt_protect_table_del(info, gfn);
		}
	}
	mutex_unlock(&info->vgpu_lock);
}

void intel_vgpu_detach_regions(struct intel_vgpu *vgpu)
{
	int i;

	if (!vgpu->region)
		return;

	for (i = 0; i < vgpu->num_regions; i++)
		if (vgpu->region[i].ops->release)
			vgpu->region[i].ops->release(vgpu,
					&vgpu->region[i]);
	vgpu->num_regions = 0;
	kfree(vgpu->region);
	vgpu->region = NULL;
}
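
/*
 * DMA mapping cache: the first request for a gfn pins the guest pages
 * and creates an IOMMU mapping; later requests for the same gfn and
 * size just take a reference. The mapping is torn down when the last
 * reference is dropped through intel_gvt_dma_unmap_guest_page().
 */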
int intel_gvt_dma_map_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size, dma_addr_t *dma_addr)
{
	struct gvt_dma *entry;
	int ret;

	if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
		return -EINVAL;

	mutex_lock(&vgpu->cache_lock);

	entry = __gvt_cache_find_gfn(vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else if (entry->size != size) {
		/* the same gfn with different size: unmap and re-map */
		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
		__gvt_cache_remove_entry(vgpu, entry);

		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else {
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&vgpu->cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
err_unlock:
	mutex_unlock(&vgpu->cache_lock);
	return ret;
}

int intel_gvt_dma_pin_guest_page(struct intel_vgpu *vgpu, dma_addr_t dma_addr)
{
	struct gvt_dma *entry;
	int ret = 0;

	if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
		return -EINVAL;

	mutex_lock(&vgpu->cache_lock);
	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
	if (entry)
		kref_get(&entry->ref);
	else
		ret = -ENOMEM;
	mutex_unlock(&vgpu->cache_lock);

	return ret;
}

static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
			   entry->size);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}

void intel_gvt_dma_unmap_guest_page(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct gvt_dma *entry;

	if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
		return;

	mutex_lock(&vgpu->cache_lock);
	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&vgpu->cache_lock);
}

static void init_device_info(struct intel_gvt *gvt)
{
	struct intel_gvt_device_info *info = &gvt->device_info;
	struct pci_dev *pdev = to_pci_dev(gvt->gt->i915->drm.dev);

	info->max_support_vgpus = 8;
	info->cfg_space_size = PCI_CFG_SPACE_EXP_SIZE;
	info->mmio_size = 2 * 1024 * 1024;
	info->mmio_bar = 0;
	info->gtt_start_offset = 8 * 1024 * 1024;
	info->gtt_entry_size = 8;
	info->gtt_entry_size_shift = 3;
	info->gmadr_bytes_in_cmd = 8;
	info->max_surface_size = 36 * 1024 * 1024;
	info->msi_cap_offset = pdev->msi_cap;
}
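
/*
 * The service thread runs deferred work requested through
 * gvt->service_request: per-vGPU vblank emulation and scheduler kicks.
 */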
static void intel_gvt_test_and_emulate_vblank(struct intel_gvt *gvt)
{
	struct intel_vgpu *vgpu;
	int id;

	mutex_lock(&gvt->lock);
	idr_for_each_entry((&(gvt)->vgpu_idr), (vgpu), (id)) {
		if (test_and_clear_bit(INTEL_GVT_REQUEST_EMULATE_VBLANK + id,
				       (void *)&gvt->service_request)) {
			if (test_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status))
				intel_vgpu_emulate_vblank(vgpu);
		}
	}
	mutex_unlock(&gvt->lock);
}

static int gvt_service_thread(void *data)
{
	struct intel_gvt *gvt = (struct intel_gvt *)data;
	int ret;

	gvt_dbg_core("service thread start\n");

	while (!kthread_should_stop()) {
		ret = wait_event_interruptible(gvt->service_thread_wq,
				kthread_should_stop() || gvt->service_request);

		if (kthread_should_stop())
			break;

		if (WARN_ONCE(ret, "service thread was woken up by a signal.\n"))
			continue;

		intel_gvt_test_and_emulate_vblank(gvt);

		if (test_bit(INTEL_GVT_REQUEST_SCHED,
				(void *)&gvt->service_request) ||
			test_bit(INTEL_GVT_REQUEST_EVENT_SCHED,
					(void *)&gvt->service_request)) {
			intel_gvt_schedule(gvt);
		}
	}

	return 0;
}

static void clean_service_thread(struct intel_gvt *gvt)
{
	kthread_stop(gvt->service_thread);
}

static int init_service_thread(struct intel_gvt *gvt)
{
	init_waitqueue_head(&gvt->service_thread_wq);

	gvt->service_thread = kthread_run(gvt_service_thread,
			gvt, "gvt_service_thread");
	if (IS_ERR(gvt->service_thread)) {
		gvt_err("failed to start service thread.\n");
		return PTR_ERR(gvt->service_thread);
	}
	return 0;
}
/**
 * intel_gvt_clean_device - clean a GVT device
 * @i915: i915 private
 *
 * This function is called at the driver unloading stage, to free the
 * resources owned by a GVT device.
 *
 */
static void intel_gvt_clean_device(struct drm_i915_private *i915)
{
	struct intel_gvt *gvt = fetch_and_zero(&i915->gvt);

	if (drm_WARN_ON(&i915->drm, !gvt))
		return;

	mdev_unregister_parent(&gvt->parent);
	intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
	intel_gvt_clean_vgpu_types(gvt);

	intel_gvt_debugfs_clean(gvt);
	clean_service_thread(gvt);
	intel_gvt_clean_cmd_parser(gvt);
	intel_gvt_clean_sched_policy(gvt);
	intel_gvt_clean_workload_scheduler(gvt);
	intel_gvt_clean_gtt(gvt);
	intel_gvt_free_firmware(gvt);
	intel_gvt_clean_mmio_info(gvt);
	idr_destroy(&gvt->vgpu_idr);

	kfree(gvt);
}

/**
 * intel_gvt_init_device - initialize a GVT device
 * @i915: drm i915 private data
 *
 * This function is called at the initialization stage, to initialize
 * necessary GVT components.
 *
 * Returns:
 * Zero on success, negative error code if failed.
 *
 */
static int intel_gvt_init_device(struct drm_i915_private *i915)
{
	struct intel_gvt *gvt;
	struct intel_vgpu *vgpu;
	int ret;

	if (drm_WARN_ON(&i915->drm, i915->gvt))
		return -EEXIST;

	gvt = kzalloc(sizeof(struct intel_gvt), GFP_KERNEL);
	if (!gvt)
		return -ENOMEM;

	gvt_dbg_core("init gvt device\n");

	idr_init_base(&gvt->vgpu_idr, 1);
	spin_lock_init(&gvt->scheduler.mmio_context_lock);
	mutex_init(&gvt->lock);
	mutex_init(&gvt->sched_lock);
	gvt->gt = to_gt(i915);
	i915->gvt = gvt;

	init_device_info(gvt);

	ret = intel_gvt_setup_mmio_info(gvt);
	if (ret)
		goto out_clean_idr;

	intel_gvt_init_engine_mmio_context(gvt);

	ret = intel_gvt_load_firmware(gvt);
	if (ret)
		goto out_clean_mmio_info;

	ret = intel_gvt_init_irq(gvt);
	if (ret)
		goto out_free_firmware;

	ret = intel_gvt_init_gtt(gvt);
	if (ret)
		goto out_free_firmware;

	ret = intel_gvt_init_workload_scheduler(gvt);
	if (ret)
		goto out_clean_gtt;

	ret = intel_gvt_init_sched_policy(gvt);
	if (ret)
		goto out_clean_workload_scheduler;

	ret = intel_gvt_init_cmd_parser(gvt);
	if (ret)
		goto out_clean_sched_policy;

	ret = init_service_thread(gvt);
	if (ret)
		goto out_clean_cmd_parser;

	ret = intel_gvt_init_vgpu_types(gvt);
	if (ret)
		goto out_clean_thread;

	vgpu = intel_gvt_create_idle_vgpu(gvt);
	if (IS_ERR(vgpu)) {
		ret = PTR_ERR(vgpu);
		gvt_err("failed to create idle vgpu\n");
		goto out_clean_types;
	}
	gvt->idle_vgpu = vgpu;

	intel_gvt_debugfs_init(gvt);

	ret = mdev_register_parent(&gvt->parent, i915->drm.dev,
				   &intel_vgpu_mdev_driver,
				   gvt->mdev_types, gvt->num_types);
	if (ret)
		goto out_destroy_idle_vgpu;

	gvt_dbg_core("gvt device initialization is done\n");
	return 0;

out_destroy_idle_vgpu:
	intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu);
	intel_gvt_debugfs_clean(gvt);
out_clean_types:
	intel_gvt_clean_vgpu_types(gvt);
out_clean_thread:
	clean_service_thread(gvt);
out_clean_cmd_parser:
	intel_gvt_clean_cmd_parser(gvt);
out_clean_sched_policy:
	intel_gvt_clean_sched_policy(gvt);
out_clean_workload_scheduler:
	intel_gvt_clean_workload_scheduler(gvt);
out_clean_gtt:
	intel_gvt_clean_gtt(gvt);
out_free_firmware:
	intel_gvt_free_firmware(gvt);
out_clean_mmio_info:
	intel_gvt_clean_mmio_info(gvt);
out_clean_idr:
	idr_destroy(&gvt->vgpu_idr);
	kfree(gvt);
	i915->gvt = NULL;
	return ret;
}

static void intel_gvt_pm_resume(struct drm_i915_private *i915)
{
	struct intel_gvt *gvt = i915->gvt;

	intel_gvt_restore_fence(gvt);
	intel_gvt_restore_mmio(gvt);
	intel_gvt_restore_ggtt(gvt);
}

static const struct intel_vgpu_ops intel_gvt_vgpu_ops = {
	.init_device	= intel_gvt_init_device,
	.clean_device	= intel_gvt_clean_device,
	.pm_resume	= intel_gvt_pm_resume,
};

static int __init kvmgt_init(void)
{
	int ret;

	ret = intel_gvt_set_ops(&intel_gvt_vgpu_ops);
	if (ret)
		return ret;

	ret = mdev_register_driver(&intel_vgpu_mdev_driver);
	if (ret)
		intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
	return ret;
}

static void __exit kvmgt_exit(void)
{
	mdev_unregister_driver(&intel_vgpu_mdev_driver);
	intel_gvt_clear_ops(&intel_gvt_vgpu_ops);
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");