Merge tag 'for-linus-6.1-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 12 Oct 2022 21:39:38 +0000 (14:39 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 12 Oct 2022 21:39:38 +0000 (14:39 -0700)
Pull xen updates from Juergen Gross:

 - Some minor typo fixes

 - A fix of the Xen pcifront driver for supporting the device model to
   run in a Linux stub domain

 - A cleanup of the pcifront driver

 - A series to enable grant-based virtio with Xen on x86

 - A cleanup of Xen PV guests to distinguish between safe and faulting
   MSR accesses

 - Two fixes of the Xen gntdev driver

 - Two fixes of the new xen grant DMA driver

* tag 'for-linus-6.1-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
  xen: Kconfig: Fix spelling mistake "Maxmium" -> "Maximum"
  xen/pv: support selecting safe/unsafe msr accesses
  xen/pv: refactor msr access functions to support safe and unsafe accesses
  xen/pv: fix vendor checks for pmu emulation
  xen/pv: add fault recovery control to pmu msr accesses
  xen/virtio: enable grant based virtio on x86
  xen/virtio: use dom0 as default backend for CONFIG_XEN_VIRTIO_FORCE_GRANT
  xen/virtio: restructure xen grant dma setup
  xen/pcifront: move xenstore config scanning into sub-function
  xen/gntdev: Accommodate VMA splitting
  xen/gntdev: Prevent leaking grants
  xen/virtio: Fix potential deadlock when accessing xen_grant_dma_devices
  xen/virtio: Fix n_pages calculation in xen_grant_dma_map(unmap)_page()
  xen/xenbus: Fix spelling mistake "hardward" -> "hardware"
  xen-pcifront: Handle missed Connected state

12 files changed:
Documentation/admin-guide/kernel-parameters.txt
arch/x86/xen/Kconfig
arch/x86/xen/enlighten_hvm.c
arch/x86/xen/enlighten_pv.c
arch/x86/xen/pmu.c
drivers/pci/xen-pcifront.c
drivers/xen/Kconfig
drivers/xen/gntdev-common.h
drivers/xen/gntdev.c
drivers/xen/grant-dma-ops.c
drivers/xen/xen-pciback/xenbus.c
include/xen/xen-ops.h

index 69b1533..a465d52 100644 (file)
                        Crash from Xen panic notifier, without executing late
                        panic() code such as dumping handler.
 
+       xen_msr_safe=   [X86,XEN]
+                       Format: <bool>
+                       Select whether to always use non-faulting (safe) MSR
+                       access functions when running as Xen PV guest. The
+                       default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
+
        xen_nopvspin    [X86,XEN]
                        Disables the qspinlock slowpath using Xen PV optimizations.
                        This parameter is obsoleted by "nopvspin" parameter, which
index 85246dd..9b1ec5d 100644 (file)
@@ -92,3 +92,12 @@ config XEN_DOM0
        select X86_X2APIC if XEN_PVH && X86_64
        help
          Support running as a Xen Dom0 guest.
+
+config XEN_PV_MSR_SAFE
+       bool "Always use safe MSR accesses in PV guests"
+       default y
+       depends on XEN_PV
+       help
+         Use safe (not faulting) MSR access functions even if the MSR access
+         should not fault anyway.
+         The default can be changed by using the "xen_msr_safe" boot parameter.
index 1c1ac41..c1cd28e 100644 (file)
@@ -212,7 +212,7 @@ static void __init xen_hvm_guest_init(void)
                return;
 
        if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
-               virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+               virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
 
        init_hvm_pv_info();
 
index 9b1a58d..f82857e 100644 (file)
@@ -108,11 +108,21 @@ struct tls_descs {
  */
 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
 
+static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE);
+
+static int __init parse_xen_msr_safe(char *str)
+{
+       if (str)
+               return strtobool(str, &xen_msr_safe);
+       return -EINVAL;
+}
+early_param("xen_msr_safe", parse_xen_msr_safe);
+
 static void __init xen_pv_init_platform(void)
 {
        /* PV guests can't operate virtio devices without grants. */
        if (IS_ENABLED(CONFIG_XEN_VIRTIO))
-               virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+               virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
 
        populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
 
@@ -917,14 +927,18 @@ static void xen_write_cr4(unsigned long cr4)
        native_write_cr4(cr4);
 }
 
-static u64 xen_read_msr_safe(unsigned int msr, int *err)
+static u64 xen_do_read_msr(unsigned int msr, int *err)
 {
-       u64 val;
+       u64 val = 0;    /* Avoid uninitialized value for safe variant. */
 
        if (pmu_msr_read(msr, &val, err))
                return val;
 
-       val = native_read_msr_safe(msr, err);
+       if (err)
+               val = native_read_msr_safe(msr, err);
+       else
+               val = native_read_msr(msr);
+
        switch (msr) {
        case MSR_IA32_APICBASE:
                val &= ~X2APIC_ENABLE;
@@ -933,23 +947,39 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
        return val;
 }
 
-static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+static void set_seg(unsigned int which, unsigned int low, unsigned int high,
+                   int *err)
 {
-       int ret;
-       unsigned int which;
-       u64 base;
+       u64 base = ((u64)high << 32) | low;
 
-       ret = 0;
+       if (HYPERVISOR_set_segment_base(which, base) == 0)
+               return;
 
+       if (err)
+               *err = -EIO;
+       else
+               WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base);
+}
+
+/*
+ * Support write_msr_safe() and write_msr() semantics.
+ * With err == NULL write_msr() semantics are selected.
+ * Supplying an err pointer requires err to be pre-initialized with 0.
+ */
+static void xen_do_write_msr(unsigned int msr, unsigned int low,
+                            unsigned int high, int *err)
+{
        switch (msr) {
-       case MSR_FS_BASE:               which = SEGBASE_FS; goto set;
-       case MSR_KERNEL_GS_BASE:        which = SEGBASE_GS_USER; goto set;
-       case MSR_GS_BASE:               which = SEGBASE_GS_KERNEL; goto set;
-
-       set:
-               base = ((u64)high << 32) | low;
-               if (HYPERVISOR_set_segment_base(which, base) != 0)
-                       ret = -EIO;
+       case MSR_FS_BASE:
+               set_seg(SEGBASE_FS, low, high, err);
+               break;
+
+       case MSR_KERNEL_GS_BASE:
+               set_seg(SEGBASE_GS_USER, low, high, err);
+               break;
+
+       case MSR_GS_BASE:
+               set_seg(SEGBASE_GS_KERNEL, low, high, err);
                break;
 
        case MSR_STAR:
@@ -965,31 +995,42 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
                break;
 
        default:
-               if (!pmu_msr_write(msr, low, high, &ret))
-                       ret = native_write_msr_safe(msr, low, high);
+               if (!pmu_msr_write(msr, low, high, err)) {
+                       if (err)
+                               *err = native_write_msr_safe(msr, low, high);
+                       else
+                               native_write_msr(msr, low, high);
+               }
        }
+}
 
-       return ret;
+static u64 xen_read_msr_safe(unsigned int msr, int *err)
+{
+       return xen_do_read_msr(msr, err);
+}
+
+static int xen_write_msr_safe(unsigned int msr, unsigned int low,
+                             unsigned int high)
+{
+       int err = 0;
+
+       xen_do_write_msr(msr, low, high, &err);
+
+       return err;
 }
 
 static u64 xen_read_msr(unsigned int msr)
 {
-       /*
-        * This will silently swallow a #GP from RDMSR.  It may be worth
-        * changing that.
-        */
        int err;
 
-       return xen_read_msr_safe(msr, &err);
+       return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
 }
 
 static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
 {
-       /*
-        * This will silently swallow a #GP from WRMSR.  It may be worth
-        * changing that.
-        */
-       xen_write_msr_safe(msr, low, high);
+       int err;
+
+       xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL);
 }
 
 /* This is called once we have the cpu_possible_mask */
index 21ecbe7..68aff13 100644 (file)
@@ -131,6 +131,10 @@ static inline uint32_t get_fam15h_addr(u32 addr)
 
 static inline bool is_amd_pmu_msr(unsigned int msr)
 {
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+           boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+               return false;
+
        if ((msr >= MSR_F15H_PERF_CTL &&
             msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
            (msr >= MSR_K7_EVNTSEL0 &&
@@ -140,10 +144,15 @@ static inline bool is_amd_pmu_msr(unsigned int msr)
        return false;
 }
 
-static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index)
 {
        u32 msr_index_pmc;
 
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+           boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR &&
+           boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+               return false;
+
        switch (msr_index) {
        case MSR_CORE_PERF_FIXED_CTR_CTRL:
        case MSR_IA32_DS_AREA:
@@ -290,48 +299,52 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
        return false;
 }
 
+static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read,
+                                bool *emul)
+{
+       int type, index;
+
+       if (is_amd_pmu_msr(msr))
+               *emul = xen_amd_pmu_emulate(msr, val, is_read);
+       else if (is_intel_pmu_msr(msr, &type, &index))
+               *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read);
+       else
+               return false;
+
+       return true;
+}
+
 bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
 {
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
-               if (is_amd_pmu_msr(msr)) {
-                       if (!xen_amd_pmu_emulate(msr, val, 1))
-                               *val = native_read_msr_safe(msr, err);
-                       return true;
-               }
-       } else {
-               int type, index;
+       bool emulated;
 
-               if (is_intel_pmu_msr(msr, &type, &index)) {
-                       if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
-                               *val = native_read_msr_safe(msr, err);
-                       return true;
-               }
+       if (!pmu_msr_chk_emulated(msr, val, true, &emulated))
+               return false;
+
+       if (!emulated) {
+               *val = err ? native_read_msr_safe(msr, err)
+                          : native_read_msr(msr);
        }
 
-       return false;
+       return true;
 }
 
 bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
 {
        uint64_t val = ((uint64_t)high << 32) | low;
+       bool emulated;
 
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
-               if (is_amd_pmu_msr(msr)) {
-                       if (!xen_amd_pmu_emulate(msr, &val, 0))
-                               *err = native_write_msr_safe(msr, low, high);
-                       return true;
-               }
-       } else {
-               int type, index;
+       if (!pmu_msr_chk_emulated(msr, &val, false, &emulated))
+               return false;
 
-               if (is_intel_pmu_msr(msr, &type, &index)) {
-                       if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
-                               *err = native_write_msr_safe(msr, low, high);
-                       return true;
-               }
+       if (!emulated) {
+               if (err)
+                       *err = native_write_msr_safe(msr, low, high);
+               else
+                       native_write_msr(msr, low, high);
        }
 
-       return false;
+       return true;
 }
 
 static unsigned long long xen_amd_read_pmc(int counter)
index 689271c..7378e2f 100644 (file)
@@ -521,24 +521,14 @@ static int pcifront_rescan_root(struct pcifront_device *pdev,
        int err;
        struct pci_bus *b;
 
-#ifndef CONFIG_PCI_DOMAINS
-       if (domain != 0) {
-               dev_err(&pdev->xdev->dev,
-                       "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
-               dev_err(&pdev->xdev->dev,
-                       "Please compile with CONFIG_PCI_DOMAINS\n");
-               return -EINVAL;
-       }
-#endif
-
-       dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
-                domain, bus);
-
        b = pci_find_bus(domain, bus);
        if (!b)
                /* If the bus is unknown, create it. */
                return pcifront_scan_root(pdev, domain, bus);
 
+       dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
+                domain, bus);
+
        err = pcifront_scan_bus(pdev, domain, bus, b);
 
        /* Claim resources before going "live" with our devices */
@@ -819,76 +809,73 @@ out:
        return err;
 }
 
-static int pcifront_try_connect(struct pcifront_device *pdev)
+static void pcifront_connect(struct pcifront_device *pdev)
 {
-       int err = -EFAULT;
+       int err;
        int i, num_roots, len;
        char str[64];
        unsigned int domain, bus;
 
-
-       /* Only connect once */
-       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-           XenbusStateInitialised)
-               goto out;
-
-       err = pcifront_connect_and_init_dma(pdev);
-       if (err && err != -EEXIST) {
-               xenbus_dev_fatal(pdev->xdev, err,
-                                "Error setting up PCI Frontend");
-               goto out;
-       }
-
        err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
                           "root_num", "%d", &num_roots);
        if (err == -ENOENT) {
                xenbus_dev_error(pdev->xdev, err,
                                 "No PCI Roots found, trying 0000:00");
-               err = pcifront_scan_root(pdev, 0, 0);
+               err = pcifront_rescan_root(pdev, 0, 0);
                if (err) {
                        xenbus_dev_fatal(pdev->xdev, err,
                                         "Error scanning PCI root 0000:00");
-                       goto out;
+                       return;
                }
                num_roots = 0;
        } else if (err != 1) {
-               if (err == 0)
-                       err = -EINVAL;
-               xenbus_dev_fatal(pdev->xdev, err,
+               xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err,
                                 "Error reading number of PCI roots");
-               goto out;
+               return;
        }
 
        for (i = 0; i < num_roots; i++) {
                len = snprintf(str, sizeof(str), "root-%d", i);
-               if (unlikely(len >= (sizeof(str) - 1))) {
-                       err = -ENOMEM;
-                       goto out;
-               }
+               if (unlikely(len >= (sizeof(str) - 1)))
+                       return;
 
                err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
                                   "%x:%x", &domain, &bus);
                if (err != 2) {
-                       if (err >= 0)
-                               err = -EINVAL;
-                       xenbus_dev_fatal(pdev->xdev, err,
+                       xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err,
                                         "Error reading PCI root %d", i);
-                       goto out;
+                       return;
                }
 
-               err = pcifront_scan_root(pdev, domain, bus);
+               err = pcifront_rescan_root(pdev, domain, bus);
                if (err) {
                        xenbus_dev_fatal(pdev->xdev, err,
                                         "Error scanning PCI root %04x:%02x",
                                         domain, bus);
-                       goto out;
+                       return;
                }
        }
 
-       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+       xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+}
 
-out:
-       return err;
+static void pcifront_try_connect(struct pcifront_device *pdev)
+{
+       int err;
+
+       /* Only connect once */
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateInitialised)
+               return;
+
+       err = pcifront_connect_and_init_dma(pdev);
+       if (err && err != -EEXIST) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error setting up PCI Frontend");
+               return;
+       }
+
+       pcifront_connect(pdev);
 }
 
 static int pcifront_try_disconnect(struct pcifront_device *pdev)
@@ -914,80 +901,37 @@ out:
        return err;
 }
 
-static int pcifront_attach_devices(struct pcifront_device *pdev)
+static void pcifront_attach_devices(struct pcifront_device *pdev)
 {
-       int err = -EFAULT;
-       int i, num_roots, len;
-       unsigned int domain, bus;
-       char str[64];
-
-       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+       if (xenbus_read_driver_state(pdev->xdev->nodename) ==
            XenbusStateReconfiguring)
-               goto out;
-
-       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
-                          "root_num", "%d", &num_roots);
-       if (err == -ENOENT) {
-               xenbus_dev_error(pdev->xdev, err,
-                                "No PCI Roots found, trying 0000:00");
-               err = pcifront_rescan_root(pdev, 0, 0);
-               if (err) {
-                       xenbus_dev_fatal(pdev->xdev, err,
-                                        "Error scanning PCI root 0000:00");
-                       goto out;
-               }
-               num_roots = 0;
-       } else if (err != 1) {
-               if (err == 0)
-                       err = -EINVAL;
-               xenbus_dev_fatal(pdev->xdev, err,
-                                "Error reading number of PCI roots");
-               goto out;
-       }
-
-       for (i = 0; i < num_roots; i++) {
-               len = snprintf(str, sizeof(str), "root-%d", i);
-               if (unlikely(len >= (sizeof(str) - 1))) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-
-               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
-                                  "%x:%x", &domain, &bus);
-               if (err != 2) {
-                       if (err >= 0)
-                               err = -EINVAL;
-                       xenbus_dev_fatal(pdev->xdev, err,
-                                        "Error reading PCI root %d", i);
-                       goto out;
-               }
-
-               err = pcifront_rescan_root(pdev, domain, bus);
-               if (err) {
-                       xenbus_dev_fatal(pdev->xdev, err,
-                                        "Error scanning PCI root %04x:%02x",
-                                        domain, bus);
-                       goto out;
-               }
-       }
-
-       xenbus_switch_state(pdev->xdev, XenbusStateConnected);
-
-out:
-       return err;
+               pcifront_connect(pdev);
 }
 
 static int pcifront_detach_devices(struct pcifront_device *pdev)
 {
        int err = 0;
        int i, num_devs;
+       enum xenbus_state state;
        unsigned int domain, bus, slot, func;
        struct pci_dev *pci_dev;
        char str[64];
 
-       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-           XenbusStateConnected)
+       state = xenbus_read_driver_state(pdev->xdev->nodename);
+       if (state == XenbusStateInitialised) {
+               dev_dbg(&pdev->xdev->dev, "Handle skipped connect.\n");
+               /* We missed Connected and need to initialize. */
+               err = pcifront_connect_and_init_dma(pdev);
+               if (err && err != -EEXIST) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error setting up PCI Frontend");
+                       goto out;
+               }
+
+               goto out_switch_state;
+       } else if (state != XenbusStateConnected) {
                goto out;
+       }
 
        err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
                           &num_devs);
@@ -1048,6 +992,7 @@ static int pcifront_detach_devices(struct pcifront_device *pdev)
                        domain, bus, slot, func);
        }
 
+ out_switch_state:
        err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
 
 out:
index a65bd92..d5d7c40 100644 (file)
@@ -56,7 +56,7 @@ config XEN_MEMORY_HOTPLUG_LIMIT
        depends on XEN_HAVE_PVMMU
        depends on MEMORY_HOTPLUG
        help
-         Maxmium amount of memory (in GiB) that a PV guest can be
+         Maximum amount of memory (in GiB) that a PV guest can be
          expanded to when using memory hotplug.
 
          A PV guest can have more memory than this limit if is
index 40ef379..9c286b2 100644 (file)
@@ -44,9 +44,10 @@ struct gntdev_unmap_notify {
 };
 
 struct gntdev_grant_map {
+       atomic_t in_use;
        struct mmu_interval_notifier notifier;
+       bool notifier_init;
        struct list_head next;
-       struct vm_area_struct *vma;
        int index;
        int count;
        int flags;
index 84b143e..4d9a305 100644 (file)
@@ -286,6 +286,9 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
                 */
        }
 
+       if (use_ptemod && map->notifier_init)
+               mmu_interval_notifier_remove(&map->notifier);
+
        if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
                notify_remote_via_evtchn(map->notify.event);
                evtchn_put(map->notify.event);
@@ -298,7 +301,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data)
 {
        struct gntdev_grant_map *map = data;
-       unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+       unsigned int pgnr = (addr - map->pages_vm_start) >> PAGE_SHIFT;
        int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte |
                    (1 << _GNTMAP_guest_avail0);
        u64 pte_maddr;
@@ -367,8 +370,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
        for (i = 0; i < map->count; i++) {
                if (map->map_ops[i].status == GNTST_okay) {
                        map->unmap_ops[i].handle = map->map_ops[i].handle;
-                       if (!use_ptemod)
-                               alloced++;
+                       alloced++;
                } else if (!err)
                        err = -EINVAL;
 
@@ -377,8 +379,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 
                if (use_ptemod) {
                        if (map->kmap_ops[i].status == GNTST_okay) {
-                               if (map->map_ops[i].status == GNTST_okay)
-                                       alloced++;
+                               alloced++;
                                map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
                        } else if (!err)
                                err = -EINVAL;
@@ -394,8 +395,14 @@ static void __unmap_grant_pages_done(int result,
        unsigned int i;
        struct gntdev_grant_map *map = data->data;
        unsigned int offset = data->unmap_ops - map->unmap_ops;
+       int successful_unmaps = 0;
+       int live_grants;
 
        for (i = 0; i < data->count; i++) {
+               if (map->unmap_ops[offset + i].status == GNTST_okay &&
+                   map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
+                       successful_unmaps++;
+
                WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay &&
                        map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE);
                pr_debug("unmap handle=%d st=%d\n",
@@ -403,6 +410,10 @@ static void __unmap_grant_pages_done(int result,
                        map->unmap_ops[offset+i].status);
                map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
                if (use_ptemod) {
+                       if (map->kunmap_ops[offset + i].status == GNTST_okay &&
+                           map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
+                               successful_unmaps++;
+
                        WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay &&
                                map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE);
                        pr_debug("kunmap handle=%u st=%d\n",
@@ -411,11 +422,15 @@ static void __unmap_grant_pages_done(int result,
                        map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
                }
        }
+
        /*
         * Decrease the live-grant counter.  This must happen after the loop to
         * prevent premature reuse of the grants by gnttab_mmap().
         */
-       atomic_sub(data->count, &map->live_grants);
+       live_grants = atomic_sub_return(successful_unmaps, &map->live_grants);
+       if (WARN_ON(live_grants < 0))
+               pr_err("%s: live_grants became negative (%d) after unmapping %d pages!\n",
+                      __func__, live_grants, successful_unmaps);
 
        /* Release reference taken by __unmap_grant_pages */
        gntdev_put_map(NULL, map);
@@ -496,11 +511,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
        struct gntdev_priv *priv = file->private_data;
 
        pr_debug("gntdev_vma_close %p\n", vma);
-       if (use_ptemod) {
-               WARN_ON(map->vma != vma);
-               mmu_interval_notifier_remove(&map->notifier);
-               map->vma = NULL;
-       }
+
        vma->vm_private_data = NULL;
        gntdev_put_map(priv, map);
 }
@@ -528,29 +539,30 @@ static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
        struct gntdev_grant_map *map =
                container_of(mn, struct gntdev_grant_map, notifier);
        unsigned long mstart, mend;
+       unsigned long map_start, map_end;
 
        if (!mmu_notifier_range_blockable(range))
                return false;
 
+       map_start = map->pages_vm_start;
+       map_end = map->pages_vm_start + (map->count << PAGE_SHIFT);
+
        /*
         * If the VMA is split or otherwise changed the notifier is not
         * updated, but we don't want to process VA's outside the modified
         * VMA. FIXME: It would be much more understandable to just prevent
         * modifying the VMA in the first place.
         */
-       if (map->vma->vm_start >= range->end ||
-           map->vma->vm_end <= range->start)
+       if (map_start >= range->end || map_end <= range->start)
                return true;
 
-       mstart = max(range->start, map->vma->vm_start);
-       mend = min(range->end, map->vma->vm_end);
+       mstart = max(range->start, map_start);
+       mend = min(range->end, map_end);
        pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
-                       map->index, map->count,
-                       map->vma->vm_start, map->vma->vm_end,
-                       range->start, range->end, mstart, mend);
-       unmap_grant_pages(map,
-                               (mstart - map->vma->vm_start) >> PAGE_SHIFT,
-                               (mend - mstart) >> PAGE_SHIFT);
+                map->index, map->count, map_start, map_end,
+                range->start, range->end, mstart, mend);
+       unmap_grant_pages(map, (mstart - map_start) >> PAGE_SHIFT,
+                         (mend - mstart) >> PAGE_SHIFT);
 
        return true;
 }
@@ -1030,18 +1042,15 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
                return -EINVAL;
 
        pr_debug("map %d+%d at %lx (pgoff %lx)\n",
-                       index, count, vma->vm_start, vma->vm_pgoff);
+                index, count, vma->vm_start, vma->vm_pgoff);
 
        mutex_lock(&priv->lock);
        map = gntdev_find_map_index(priv, index, count);
        if (!map)
                goto unlock_out;
-       if (use_ptemod && map->vma)
+       if (!atomic_add_unless(&map->in_use, 1, 1))
                goto unlock_out;
-       if (atomic_read(&map->live_grants)) {
-               err = -EAGAIN;
-               goto unlock_out;
-       }
+
        refcount_inc(&map->users);
 
        vma->vm_ops = &gntdev_vmops;
@@ -1062,15 +1071,16 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
                        map->flags |= GNTMAP_readonly;
        }
 
+       map->pages_vm_start = vma->vm_start;
+
        if (use_ptemod) {
-               map->vma = vma;
                err = mmu_interval_notifier_insert_locked(
                        &map->notifier, vma->vm_mm, vma->vm_start,
                        vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
-               if (err) {
-                       map->vma = NULL;
+               if (err)
                        goto out_unlock_put;
-               }
+
+               map->notifier_init = true;
        }
        mutex_unlock(&priv->lock);
 
@@ -1087,7 +1097,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
                 */
                mmu_interval_read_begin(&map->notifier);
 
-               map->pages_vm_start = vma->vm_start;
                err = apply_to_page_range(vma->vm_mm, vma->vm_start,
                                          vma->vm_end - vma->vm_start,
                                          find_grant_ptes, map);
@@ -1116,13 +1125,8 @@ unlock_out:
 out_unlock_put:
        mutex_unlock(&priv->lock);
 out_put_map:
-       if (use_ptemod) {
+       if (use_ptemod)
                unmap_grant_pages(map, 0, map->count);
-               if (map->vma) {
-                       mmu_interval_notifier_remove(&map->notifier);
-                       map->vma = NULL;
-               }
-       }
        gntdev_put_map(priv, map);
        return err;
 }
index 8973fc1..860f37c 100644 (file)
@@ -25,7 +25,7 @@ struct xen_grant_dma_data {
        bool broken;
 };
 
-static DEFINE_XARRAY(xen_grant_dma_devices);
+static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ);
 
 #define XEN_GRANT_DMA_ADDR_OFF (1ULL << 63)
 
@@ -42,14 +42,29 @@ static inline grant_ref_t dma_to_grant(dma_addr_t dma)
 static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev)
 {
        struct xen_grant_dma_data *data;
+       unsigned long flags;
 
-       xa_lock(&xen_grant_dma_devices);
+       xa_lock_irqsave(&xen_grant_dma_devices, flags);
        data = xa_load(&xen_grant_dma_devices, (unsigned long)dev);
-       xa_unlock(&xen_grant_dma_devices);
+       xa_unlock_irqrestore(&xen_grant_dma_devices, flags);
 
        return data;
 }
 
+static int store_xen_grant_dma_data(struct device *dev,
+                                   struct xen_grant_dma_data *data)
+{
+       unsigned long flags;
+       int ret;
+
+       xa_lock_irqsave(&xen_grant_dma_devices, flags);
+       ret = xa_err(__xa_store(&xen_grant_dma_devices, (unsigned long)dev, data,
+                       GFP_ATOMIC));
+       xa_unlock_irqrestore(&xen_grant_dma_devices, flags);
+
+       return ret;
+}
+
 /*
  * DMA ops for Xen frontends (e.g. virtio).
  *
@@ -153,7 +168,7 @@ static dma_addr_t xen_grant_dma_map_page(struct device *dev, struct page *page,
                                         unsigned long attrs)
 {
        struct xen_grant_dma_data *data;
-       unsigned int i, n_pages = PFN_UP(size);
+       unsigned int i, n_pages = PFN_UP(offset + size);
        grant_ref_t grant;
        dma_addr_t dma_handle;
 
@@ -185,7 +200,8 @@ static void xen_grant_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
                                     unsigned long attrs)
 {
        struct xen_grant_dma_data *data;
-       unsigned int i, n_pages = PFN_UP(size);
+       unsigned long offset = dma_handle & (PAGE_SIZE - 1);
+       unsigned int i, n_pages = PFN_UP(offset + size);
        grant_ref_t grant;
 
        if (WARN_ON(dir == DMA_NONE))
@@ -273,72 +289,91 @@ static const struct dma_map_ops xen_grant_dma_ops = {
        .dma_supported = xen_grant_dma_supported,
 };
 
-bool xen_is_grant_dma_device(struct device *dev)
+static bool xen_is_dt_grant_dma_device(struct device *dev)
 {
        struct device_node *iommu_np;
        bool has_iommu;
 
-       /* XXX Handle only DT devices for now */
-       if (!dev->of_node)
-               return false;
-
        iommu_np = of_parse_phandle(dev->of_node, "iommus", 0);
-       has_iommu = iommu_np && of_device_is_compatible(iommu_np, "xen,grant-dma");
+       has_iommu = iommu_np &&
+                   of_device_is_compatible(iommu_np, "xen,grant-dma");
        of_node_put(iommu_np);
 
        return has_iommu;
 }
 
+bool xen_is_grant_dma_device(struct device *dev)
+{
+       /* XXX Handle only DT devices for now */
+       if (dev->of_node)
+               return xen_is_dt_grant_dma_device(dev);
+
+       return false;
+}
+
 bool xen_virtio_mem_acc(struct virtio_device *dev)
 {
-       if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
+       if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain())
                return true;
 
        return xen_is_grant_dma_device(dev->dev.parent);
 }
 
-void xen_grant_setup_dma_ops(struct device *dev)
+static int xen_dt_grant_init_backend_domid(struct device *dev,
+                                          struct xen_grant_dma_data *data)
 {
-       struct xen_grant_dma_data *data;
        struct of_phandle_args iommu_spec;
 
-       data = find_xen_grant_dma_data(dev);
-       if (data) {
-               dev_err(dev, "Xen grant DMA data is already created\n");
-               return;
-       }
-
-       /* XXX ACPI device unsupported for now */
-       if (!dev->of_node)
-               goto err;
-
        if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells",
                        0, &iommu_spec)) {
                dev_err(dev, "Cannot parse iommus property\n");
-               goto err;
+               return -ESRCH;
        }
 
        if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") ||
                        iommu_spec.args_count != 1) {
                dev_err(dev, "Incompatible IOMMU node\n");
                of_node_put(iommu_spec.np);
-               goto err;
+               return -ESRCH;
        }
 
        of_node_put(iommu_spec.np);
 
+       /*
+        * The endpoint ID here means the ID of the domain where the
+        * corresponding backend is running
+        */
+       data->backend_domid = iommu_spec.args[0];
+
+       return 0;
+}
+
+void xen_grant_setup_dma_ops(struct device *dev)
+{
+       struct xen_grant_dma_data *data;
+
+       data = find_xen_grant_dma_data(dev);
+       if (data) {
+               dev_err(dev, "Xen grant DMA data is already created\n");
+               return;
+       }
+
        data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
        if (!data)
                goto err;
 
-       /*
-        * The endpoint ID here means the ID of the domain where the corresponding
-        * backend is running
-        */
-       data->backend_domid = iommu_spec.args[0];
+       if (dev->of_node) {
+               if (xen_dt_grant_init_backend_domid(dev, data))
+                       goto err;
+       } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) {
+               dev_info(dev, "Using dom0 as backend\n");
+               data->backend_domid = 0;
+       } else {
+               /* XXX ACPI device unsupported for now */
+               goto err;
+       }
 
-       if (xa_err(xa_store(&xen_grant_dma_devices, (unsigned long)dev, data,
-                       GFP_KERNEL))) {
+       if (store_xen_grant_dma_data(dev, data)) {
                dev_err(dev, "Cannot store Xen grant DMA data\n");
                goto err;
        }
@@ -348,9 +383,20 @@ void xen_grant_setup_dma_ops(struct device *dev)
        return;
 
 err:
+       devm_kfree(dev, data);
        dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n");
 }
 
+bool xen_virtio_restricted_mem_acc(struct virtio_device *dev)
+{
+       bool ret = xen_virtio_mem_acc(dev);
+
+       if (ret)
+               xen_grant_setup_dma_ops(dev->dev.parent);
+
+       return ret;
+}
+
 MODULE_DESCRIPTION("Xen grant DMA-mapping layer");
 MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");
 MODULE_LICENSE("GPL");
index bde63ef..d171091 100644 (file)
@@ -31,7 +31,7 @@ MODULE_PARM_DESC(passthrough,
        "   frontend (for example, a device at 06:01.b will still appear at\n"\
        "   06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
        "   exposed PCI devices to its driver domains. This may be required\n"\
-       "   for drivers which depend on finding their hardward in certain\n"\
+       "   for drivers which depend on finding their hardware in certain\n"\
        "   bus/slot locations.");
 
 static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
index dae0f35..a34f427 100644 (file)
@@ -219,6 +219,7 @@ static inline void xen_preemptible_hcall_end(void) { }
 void xen_grant_setup_dma_ops(struct device *dev);
 bool xen_is_grant_dma_device(struct device *dev);
 bool xen_virtio_mem_acc(struct virtio_device *dev);
+bool xen_virtio_restricted_mem_acc(struct virtio_device *dev);
 #else
 static inline void xen_grant_setup_dma_ops(struct device *dev)
 {
@@ -234,6 +235,11 @@ static inline bool xen_virtio_mem_acc(struct virtio_device *dev)
 {
        return false;
 }
+
+static inline bool xen_virtio_restricted_mem_acc(struct virtio_device *dev)
+{
+       return false;
+}
 #endif /* CONFIG_XEN_GRANT_DMA_OPS */
 
 #endif /* INCLUDE_XEN_OPS_H */