lib: add support for device coherent type in test_hmm
author Alex Sierra <alex.sierra@amd.com>
Fri, 15 Jul 2022 15:05:17 +0000 (10:05 -0500)
committer akpm <akpm@linux-foundation.org>
Mon, 18 Jul 2022 00:14:29 +0000 (17:14 -0700)
Device Coherent type uses device memory that is coherently accessible by
the CPU.  This could be shown as SP (special purpose) memory range at the
BIOS-e820 memory enumeration.  If no SP memory is supported in system,
this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size.  These can be specified with the efi_fake_mem kernel
parameter.  For example, two SP ranges of 1GB starting at physical
addresses 0x100000000 & 0x140000000:
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probe.  This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1.  In this case, it will create four instances of
device_mirror.  The first two correspond to private device type, the last
two to coherent type.  Then, they can be easily accessed from user space
through /dev/hmm_mirror<num_device>.  Usually num_device 0 and 1 are for
private, and 2 and 3 for coherent types.  If no module parameters are
passed, two instances of private type device_mirror will be created only.

Link: https://lkml.kernel.org/r/20220715150521.18165-11-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
lib/test_hmm.c
lib/test_hmm_uapi.h

index 436124d..e3965ca 100644 (file)
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES               2
+#define DMIRROR_NDEVICES               4
 #define DMIRROR_RANGE_FAULT_TIMEOUT    1000
 #define DEVMEM_CHUNK_SIZE              (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE          16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ *
+ * Note: the macro argument is expanded twice (once in the test, once in
+ * the selected branch), so pass an expression without side effects.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+                          (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
        return 0;
 }
 
+/* Report whether @mdevice was set up with the device-private memory type. */
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+       return mdevice->zone_device_type == HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+}
+
+/*
+ * Pick the migrate_vma selection flag that matches the memory type of the
+ * device behind @dmirror: a device-private type selects device-private
+ * pages, any other type selects device-coherent pages.
+ */
+static enum migrate_vma_direction
+dmirror_select_device(struct dmirror *dmirror)
+{
+       if (dmirror->mdevice->zone_device_type ==
+           HMM_DMIRROR_MEMORY_DEVICE_PRIVATE)
+               return MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+
+       return MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
        vfree(bounce->ptr);
@@ -575,16 +601,19 @@ err_devmem:
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
        struct page *dpage = NULL;
-       struct page *rpage;
+       struct page *rpage = NULL;
 
        /*
-        * This is a fake device so we alloc real system memory to store
-        * our device memory.
+        * For ZONE_DEVICE private type, this is a fake device so we allocate
+        * real system memory to store our device memory.
+        * For ZONE_DEVICE coherent type we use the actual dpage to store the
+        * data and ignore rpage.
         */
-       rpage = alloc_page(GFP_HIGHUSER);
-       if (!rpage)
-               return NULL;
-
+       if (dmirror_is_private_zone(mdevice)) {
+               rpage = alloc_page(GFP_HIGHUSER);
+               if (!rpage)
+                       return NULL;
+       }
        spin_lock(&mdevice->lock);
 
        if (mdevice->free_pages) {
@@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
        return dpage;
 
 error:
-       __free_page(rpage);
+       if (rpage)
+               __free_page(rpage);
        return NULL;
 }
 
@@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
                 * unallocated pte_none() or read-only zero page.
                 */
                spage = migrate_pfn_to_page(*src);
+               if (WARN(spage && is_zone_device_page(spage),
+                    "page already in device spage pfn: 0x%lx\n",
+                    page_to_pfn(spage)))
+                       continue;
 
                dpage = dmirror_devmem_alloc_page(mdevice);
                if (!dpage)
                        continue;
 
-               rpage = dpage->zone_device_data;
+               rpage = BACKING_PAGE(dpage);
                if (spage)
                        copy_highpage(rpage, spage);
                else
@@ -648,6 +682,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
                 */
                rpage->zone_device_data = dmirror;
 
+               pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
+                        page_to_pfn(spage), page_to_pfn(dpage));
                *dst = migrate_pfn(page_to_pfn(dpage));
                if ((*src & MIGRATE_PFN_WRITE) ||
                    (!spage && args->vma->vm_flags & VM_WRITE))
@@ -725,11 +761,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
                if (!dpage)
                        continue;
 
-               /*
-                * Store the page that holds the data so the page table
-                * doesn't have to deal with ZONE_DEVICE private pages.
-                */
-               entry = dpage->zone_device_data;
+               entry = BACKING_PAGE(dpage);
                if (*dst & MIGRATE_PFN_WRITE)
                        entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
                entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
@@ -815,15 +847,126 @@ static int dmirror_exclusive(struct dmirror *dmirror,
        return ret;
 }
 
-static int dmirror_migrate(struct dmirror *dmirror,
-                          struct hmm_dmirror_cmd *cmd)
+/*
+ * Device-to-system half of a migrate_vma operation: for every source entry
+ * in [args->start, args->end) that is marked for migration, allocate a
+ * system page, copy the device data into it and publish it in args->dst.
+ * Entries that are skipped leave their dst slot untouched. Always returns 0.
+ */
+static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
+                                                     struct dmirror *dmirror)
+{
+       unsigned long addr = args->start;
+       unsigned long i;
+
+       for (i = 0; addr < args->end; i++, addr += PAGE_SIZE) {
+               const unsigned long src_entry = args->src[i];
+               struct page *dev_page, *data_page, *sys_page;
+
+               dev_page = migrate_pfn_to_page(src_entry);
+               if (!dev_page)
+                       continue;
+               if (!(src_entry & MIGRATE_PFN_MIGRATE))
+                       continue;
+
+               /* Anything that is not a device page is unexpected here. */
+               if (WARN_ON(!is_device_private_page(dev_page) &&
+                           !is_device_coherent_page(dev_page)))
+                       continue;
+
+               /* Resolve the page that actually holds the data. */
+               data_page = BACKING_PAGE(dev_page);
+               sys_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+               if (!sys_page)
+                       continue;
+               pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
+                        page_to_pfn(data_page), page_to_pfn(sys_page));
+
+               lock_page(sys_page);
+               /* Drop the mirror's page-table entry for this address. */
+               xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
+               copy_highpage(sys_page, data_page);
+               args->dst[i] = migrate_pfn(page_to_pfn(sys_page));
+               if (src_entry & MIGRATE_PFN_WRITE)
+                       args->dst[i] |= MIGRATE_PFN_WRITE;
+       }
+       return 0;
+}
+
+/*
+ * Count the entries of migrate->src[0 .. npages) that carry both the
+ * MIGRATE_PFN_VALID and the MIGRATE_PFN_MIGRATE flag.
+ */
+static unsigned long
+dmirror_successful_migrated_pages(struct migrate_vma *migrate)
+{
+       const unsigned long *entry = migrate->src;
+       const unsigned long *last = entry + migrate->npages;
+       unsigned long migrated = 0;
+
+       for (; entry < last; entry++) {
+               if (!(*entry & MIGRATE_PFN_VALID))
+                       continue;
+               if (*entry & MIGRATE_PFN_MIGRATE)
+                       migrated++;
+       }
+       return migrated;
+}
+
+/*
+ * Migrate the user range described by @cmd (cmd->addr, cmd->npages) from
+ * this device's memory back to system memory. The range is processed in
+ * batches of at most ARRAY_SIZE(src_pfns) pages, clamped to the current
+ * VMA. cmd->cpages accumulates, per batch, the count reported by
+ * dmirror_successful_migrated_pages().
+ *
+ * Returns 0 on success (including an empty range), -EINVAL if the range
+ * wraps, mmget_not_zero() fails or an address has no readable VMA, or the
+ * error returned by migrate_vma_setup().
+ */
+static int dmirror_migrate_to_system(struct dmirror *dmirror,
+                                    struct hmm_dmirror_cmd *cmd)
 {
        unsigned long start, end, addr;
        unsigned long size = cmd->npages << PAGE_SHIFT;
        struct mm_struct *mm = dmirror->notifier.mm;
        struct vm_area_struct *vma;
-       unsigned long src_pfns[64];
-       unsigned long dst_pfns[64];
+       unsigned long src_pfns[64] = { 0 };
+       unsigned long dst_pfns[64] = { 0 };
+       struct migrate_vma args;
+       unsigned long next;
+       /* Must start at 0: with npages == 0 the loop below never assigns it. */
+       int ret = 0;
+
+       start = cmd->addr;
+       end = start + size;
+       if (end < start)
+               return -EINVAL;
+
+       /* Since the mm is for the mirrored process, get a reference first. */
+       if (!mmget_not_zero(mm))
+               return -EINVAL;
+
+       cmd->cpages = 0;
+       mmap_read_lock(mm);
+       for (addr = start; addr < end; addr = next) {
+               vma = vma_lookup(mm, addr);
+               if (!vma || !(vma->vm_flags & VM_READ)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               /* One batch: bounded by the pfn arrays and by the VMA end. */
+               next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
+               if (next > vma->vm_end)
+                       next = vma->vm_end;
+
+               args.vma = vma;
+               args.src = src_pfns;
+               args.dst = dst_pfns;
+               args.start = addr;
+               args.end = next;
+               args.pgmap_owner = dmirror->mdevice;
+               args.flags = dmirror_select_device(dmirror);
+
+               ret = migrate_vma_setup(&args);
+               if (ret)
+                       goto out;
+
+               pr_debug("Migrating from device mem to sys mem\n");
+               dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
+
+               migrate_vma_pages(&args);
+               cmd->cpages += dmirror_successful_migrated_pages(&args);
+               migrate_vma_finalize(&args);
+       }
+out:
+       mmap_read_unlock(mm);
+       mmput(mm);
+
+       return ret;
+}
+
+static int dmirror_migrate_to_device(struct dmirror *dmirror,
+                               struct hmm_dmirror_cmd *cmd)
+{
+       unsigned long start, end, addr;
+       unsigned long size = cmd->npages << PAGE_SHIFT;
+       struct mm_struct *mm = dmirror->notifier.mm;
+       struct vm_area_struct *vma;
+       unsigned long src_pfns[64] = { 0 };
+       unsigned long dst_pfns[64] = { 0 };
        struct dmirror_bounce bounce;
        struct migrate_vma args;
        unsigned long next;
@@ -860,6 +1003,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
                if (ret)
                        goto out;
 
+               pr_debug("Migrating from sys mem to device mem\n");
                dmirror_migrate_alloc_and_copy(&args, dmirror);
                migrate_vma_pages(&args);
                dmirror_migrate_finalize_and_map(&args, dmirror);
@@ -868,7 +1012,10 @@ static int dmirror_migrate(struct dmirror *dmirror,
        mmap_read_unlock(mm);
        mmput(mm);
 
-       /* Return the migrated data for verification. */
+       /*
+        * Return the migrated data for verification.
+        * Only for pages in device zone
+        */
        ret = dmirror_bounce_init(&bounce, start, size);
        if (ret)
                return ret;
@@ -911,6 +1058,12 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
                        *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
                else
                        *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
+       } else if (is_device_coherent_page(page)) {
+               /* Is the page migrated to this device or some other? */
+               if (dmirror->mdevice == dmirror_page_to_device(page))
+                       *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
+               else
+                       *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
        } else if (is_zero_pfn(page_to_pfn(page)))
                *perm = HMM_DMIRROR_PROT_ZERO;
        else
@@ -1098,8 +1251,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
                ret = dmirror_write(dmirror, &cmd);
                break;
 
-       case HMM_DMIRROR_MIGRATE:
-               ret = dmirror_migrate(dmirror, &cmd);
+       case HMM_DMIRROR_MIGRATE_TO_DEV:
+               ret = dmirror_migrate_to_device(dmirror, &cmd);
+               break;
+
+       case HMM_DMIRROR_MIGRATE_TO_SYS:
+               ret = dmirror_migrate_to_system(dmirror, &cmd);
                break;
 
        case HMM_DMIRROR_EXCLUSIVE:
@@ -1161,14 +1318,13 @@ static const struct file_operations dmirror_fops = {
 
 static void dmirror_devmem_free(struct page *page)
 {
-       struct page *rpage = page->zone_device_data;
+       struct page *rpage = BACKING_PAGE(page);
        struct dmirror_device *mdevice;
 
-       if (rpage)
+       if (rpage != page)
                __free_page(rpage);
 
        mdevice = dmirror_page_to_device(page);
-
        spin_lock(&mdevice->lock);
        mdevice->cfree++;
        page->zone_device_data = mdevice->free_pages;
@@ -1176,43 +1332,11 @@ static void dmirror_devmem_free(struct page *page)
        spin_unlock(&mdevice->lock);
 }
 
-static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
-                                                     struct dmirror *dmirror)
-{
-       const unsigned long *src = args->src;
-       unsigned long *dst = args->dst;
-       unsigned long start = args->start;
-       unsigned long end = args->end;
-       unsigned long addr;
-
-       for (addr = start; addr < end; addr += PAGE_SIZE,
-                                      src++, dst++) {
-               struct page *dpage, *spage;
-
-               spage = migrate_pfn_to_page(*src);
-               if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
-                       continue;
-               spage = spage->zone_device_data;
-
-               dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
-               if (!dpage)
-                       continue;
-
-               lock_page(dpage);
-               xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
-               copy_highpage(dpage, spage);
-               *dst = migrate_pfn(page_to_pfn(dpage));
-               if (*src & MIGRATE_PFN_WRITE)
-                       *dst |= MIGRATE_PFN_WRITE;
-       }
-       return 0;
-}
-
 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 {
        struct migrate_vma args;
-       unsigned long src_pfns;
-       unsigned long dst_pfns;
+       unsigned long src_pfns = 0;
+       unsigned long dst_pfns = 0;
        struct page *rpage;
        struct dmirror *dmirror;
        vm_fault_t ret;
@@ -1232,7 +1356,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
        args.src = &src_pfns;
        args.dst = &dst_pfns;
        args.pgmap_owner = dmirror->mdevice;
-       args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+       args.flags = dmirror_select_device(dmirror);
 
        if (migrate_vma_setup(&args))
                return VM_FAULT_SIGBUS;
@@ -1311,6 +1435,12 @@ static int __init hmm_dmirror_init(void)
                                HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
        dmirror_devices[ndevices++].zone_device_type =
                                HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+       if (spm_addr_dev0 && spm_addr_dev1) {
+               dmirror_devices[ndevices++].zone_device_type =
+                                       HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
+               dmirror_devices[ndevices++].zone_device_type =
+                                       HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
+       }
        for (id = 0; id < ndevices; id++) {
                ret = dmirror_device_init(dmirror_devices + id, id);
                if (ret)
@@ -1333,7 +1463,8 @@ static void __exit hmm_dmirror_exit(void)
        int id;
 
        for (id = 0; id < DMIRROR_NDEVICES; id++)
-               dmirror_device_remove(dmirror_devices + id);
+               if (dmirror_devices[id].zone_device_type)
+                       dmirror_device_remove(dmirror_devices + id);
        unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
 }
 
index f700da7..e31d58c 100644 (file)
@@ -50,6 +50,8 @@ struct hmm_dmirror_cmd {
  *                                     device the ioctl() is made
  * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
  *                                     other device
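+ * HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL: Migrated device coherent page on the
+ *                                     device the ioctl() is made
+ * HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE: Migrated device coherent page on some
+ *                                     other device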
  */
 enum {
        HMM_DMIRROR_PROT_ERROR                  = 0xFF,
@@ -61,6 +63,8 @@ enum {
        HMM_DMIRROR_PROT_ZERO                   = 0x10,
        HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL      = 0x20,
        HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE     = 0x30,
+       HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL     = 0x40,
+       HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE    = 0x50,
 };
 
 enum {