Re: [PATCH v1 7/9] lib: add support for device coherent type in test_hmm
From: Felix Kuehling <felix.kuehling@amd.com>
Date: 2021-11-25 02:13:25
Also in:
amd-gfx, dri-devel, linux-ext4, linux-xfs
Am 2021-11-15 um 2:30 p.m. schrieb Alex Sierra:
quoted hunk ↗ jump to hunk
Device Coherent type uses device memory that is coherently accesible by the CPU. This could be shown as SP (special purpose) memory range at the BIOS-e820 memory enumeration. If no SP memory is supported in system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP. Currently, test_hmm only supports two different SP ranges of at least 256MB size. This could be specified in the kernel parameter variable efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 & 0x140000000 physical address. Ex. efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000 Signed-off-by: Alex Sierra <redacted> --- lib/test_hmm.c | 191 +++++++++++++++++++++++++++++++++----------- lib/test_hmm_uapi.h | 15 ++-- 2 files changed, 153 insertions(+), 53 deletions(-)diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 45334df28d7b..9e2cc0cd4412 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c@@ -471,6 +471,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, unsigned long pfn_first; unsigned long pfn_last; void *ptr; + int ret = -ENOMEM; devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); if (!devmem)@@ -553,7 +554,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, } spin_unlock(&mdevice->lock); - return true; + return 0;
You're changing the meaning of the return value, but you're not updating the callers. I think at least dmirror_devmem_alloc_page will be broken by this change.
quoted hunk ↗ jump to hunk
err_release: mutex_unlock(&mdevice->devmem_lock);@@ -562,7 +563,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, err_devmem: kfree(devmem); - return false; + return ret; } static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)@@ -571,8 +572,10 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) struct page *rpage; /* - * This is a fake device so we alloc real system memory to store - * our device memory. + * For ZONE_DEVICE private type, this is a fake device so we alloc real + * system memory to store our device memory. + * For ZONE_DEVICE coherent type we use the actual dpage to store the data + * and ignore rpage. */ rpage = alloc_page(GFP_HIGHUSER); if (!rpage)@@ -623,12 +626,17 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, * unallocated pte_none() or read-only zero page. */ spage = migrate_pfn_to_page(*src); + if (spage && is_zone_device_page(spage)) + pr_err("page already in device spage pfn: 0x%lx\n", + page_to_pfn(spage)); + WARN_ON(spage && is_zone_device_page(spage));
You can write WARN_ON inside the if-condition: if (WARN_ON(spage && is_zone_device_page(spage)) ... But I don't see why you need both pr_err and WARN_ON. You can add a custom message by using WARN instead of WARN_ON: WARN(spage && is_zone_device_page(spage), "page already in device spage pfn: 0x%lx\n", page_to_pfn(spage));
quoted hunk ↗ jump to hunk
dpage = dmirror_devmem_alloc_page(mdevice); if (!dpage) continue; - rpage = dpage->zone_device_data; + rpage = is_device_private_page(dpage) ? dpage->zone_device_data : + dpage; if (spage) copy_highpage(rpage, spage); else@@ -640,8 +648,11 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, * the simulated device memory and that page holds the pointer * to the mirror. */ + rpage = dpage->zone_device_data; rpage->zone_device_data = dmirror; + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; if ((*src & MIGRATE_PFN_WRITE) ||@@ -721,10 +732,13 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, continue; /* - * Store the page that holds the data so the page table - * doesn't have to deal with ZONE_DEVICE private pages. + * For ZONE_DEVICE private pages we store the page that + * holds the data so the page table doesn't have to deal it. + * For ZONE_DEVICE coherent pages we store the actual page, since + * the CPU has coherent access to the page. */
I find this explanation confusing. The comment in dmirror_devmem_alloc_page is much clearer. I think all we need here is that dpage->zone_device_data points to the backing page for device_private pages. Device_coherent struct pages don't have a backing page because they represent a real CPU-accessible page already.
- entry = dpage->zone_device_data; + entry = is_device_private_page(dpage) ? dpage->zone_device_data : + dpage;
I see this in a few places. Maybe better to wrap that in a helper function or macro. Something like this: #define BACKING_PAGE(page) (is_device_private_page((page)) ? \ (page)->zone_device_data : (page))
quoted hunk ↗ jump to hunk
if (*dst & MIGRATE_PFN_WRITE) entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);@@ -804,8 +818,110 @@ static int dmirror_exclusive(struct dmirror *dmirror, return ret; } -static int dmirror_migrate(struct dmirror *dmirror, - struct hmm_dmirror_cmd *cmd) +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, + struct dmirror *dmirror) +{ + const unsigned long *src = args->src; + unsigned long *dst = args->dst; + unsigned long start = args->start; + unsigned long end = args->end; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE, + src++, dst++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(*src); + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) + continue; + + WARN_ON(!is_device_page(spage)); + spage = is_device_private_page(spage) ? spage->zone_device_data : + spage; + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if (!dpage) + continue; + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); + + lock_page(dpage); + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dpage, spage); + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; + if (*src & MIGRATE_PFN_WRITE) + *dst |= MIGRATE_PFN_WRITE; + } + return 0; +} + +static int dmirror_migrate_to_system(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + unsigned long start, end, addr; + unsigned long size = cmd->npages << PAGE_SHIFT; + struct mm_struct *mm = dmirror->notifier.mm; + struct vm_area_struct *vma; + unsigned long src_pfns[64]; + unsigned long dst_pfns[64]; + struct migrate_vma args; + unsigned long next; + int ret; + + start = cmd->addr; + end = start + size; + if (end < start) + return -EINVAL; + + /* Since the mm is for the mirrored process, get a reference first. */ + if (!mmget_not_zero(mm)) + return -EINVAL; + + mmap_read_lock(mm); + for (addr = start; addr < end; addr = next) { + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || + !(vma->vm_flags & VM_READ)) { + ret = -EINVAL; + goto out; + } + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + if (next > vma->vm_end) + next = vma->vm_end; + + args.vma = vma; + args.src = src_pfns; + args.dst = dst_pfns; + args.start = addr; + args.end = next; + args.pgmap_owner = dmirror->mdevice; + args.flags = (dmirror->mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : + MIGRATE_VMA_SELECT_DEVICE_COHERENT; + + ret = migrate_vma_setup(&args); + if (ret) + goto out; + + pr_debug("Migrating from device mem to sys mem\n"); + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); + + migrate_vma_pages(&args); + migrate_vma_finalize(&args); + } + mmap_read_unlock(mm); + mmput(mm); + + return ret; + +out: + mmap_read_unlock(mm); + mmput(mm); + return ret; +} + +static int dmirror_migrate_to_device(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) { unsigned long start, end, addr; unsigned long size = cmd->npages << PAGE_SHIFT;@@ -849,6 +965,7 @@ static int dmirror_migrate(struct dmirror *dmirror, if (ret) goto out; + pr_debug("Migrating from sys mem to device mem\n"); dmirror_migrate_alloc_and_copy(&args, dmirror); migrate_vma_pages(&args); dmirror_migrate_finalize_and_map(&args, dmirror);@@ -857,7 +974,7 @@ static int dmirror_migrate(struct dmirror *dmirror, mmap_read_unlock(mm); mmput(mm); - /* Return the migrated data for verification. */ + /* Return the migrated data for verification. only for pages in device zone */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) return ret;@@ -894,9 +1011,15 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, } page = hmm_pfn_to_page(entry); - if (is_device_private_page(page)) { - /* Is the page migrated to this device or some other? */ - if (dmirror->mdevice == dmirror_page_to_device(page)) + if (is_device_page(page)) { + /* Is page ZONE_DEVICE coherent? */ + if (!is_device_private_page(page)) + *perm = HMM_DMIRROR_PROT_DEV_COHERENT;
Does it make sense to distinguish between DEV_COHERENT_LOCAL and DEV_COHERENT_REMOTE as well?
quoted hunk ↗ jump to hunk
+ /* + * Is page ZONE_DEVICE private migrated to + * this device or some other? + */ + else if (dmirror->mdevice == dmirror_page_to_device(page)) *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; else *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;@@ -1096,8 +1219,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_write(dmirror, &cmd); break; - case HMM_DMIRROR_MIGRATE: - ret = dmirror_migrate(dmirror, &cmd); + case HMM_DMIRROR_MIGRATE_TO_DEV: + ret = dmirror_migrate_to_device(dmirror, &cmd); + break; + + case HMM_DMIRROR_MIGRATE_TO_SYS: + ret = dmirror_migrate_to_system(dmirror, &cmd); break; case HMM_DMIRROR_EXCLUSIVE:@@ -1152,38 +1279,6 @@ static void dmirror_devmem_free(struct page *page) spin_unlock(&mdevice->lock); } -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, - struct dmirror *dmirror) -{ - const unsigned long *src = args->src; - unsigned long *dst = args->dst; - unsigned long start = args->start; - unsigned long end = args->end; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { - struct page *dpage, *spage; - - spage = migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; - spage = spage->zone_device_data; - - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - - lock_page(dpage); - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); - copy_highpage(dpage, spage); - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; - if (*src & MIGRATE_PFN_WRITE) - *dst |= MIGRATE_PFN_WRITE; - } - return 0; -} - static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args;diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index 77f81e6314eb..af15c35a90f4 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h@@ -19,6 +19,7 @@ * @npages: (in) number of pages to read/write * @cpages: (out) number of pages copied * @faults: (out) number of device page faults seen + * @zone_device_type: (out) zone device memory type */ struct hmm_dmirror_cmd { __u64 addr;@@ -32,11 +33,12 @@ struct hmm_dmirror_cmd { /* Expose the address space of the calling process through hmm device file */ #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd) #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.@@ -51,6 +53,8 @@ struct hmm_dmirror_cmd { * device the ioctl() is made * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some * other device + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device + * the ioctl() is made */ enum { HMM_DMIRROR_PROT_ERROR = 0xFF,@@ -62,6 +66,7 @@ enum { HMM_DMIRROR_PROT_ZERO = 0x10, HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, + HMM_DMIRROR_PROT_DEV_COHERENT = 0x40,
Does it make sense to distinguish between DEV_COHERENT_LOCAL and DEV_COHERENT_REMOTE as well? Regards, Felix
};
enum {