[PATCH v2 10/12] lib: add support for device public type in test_hmm
Alistair Popple
apopple at nvidia.com
Fri Oct 1 01:32:07 UTC 2021
On Tuesday, 14 September 2021 2:16:02 AM AEST Alex Sierra wrote:
> Device Public type uses device memory that is coherently accesible by
> the CPU. This could be shown as SP (special purpose) memory range
> at the BIOS-e820 memory enumeration. If no SP memory is supported in
> system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.
>
> Currently, test_hmm only supports two different SP ranges of at least
> 256MB size. This could be specified in the kernel parameter variable
> efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
> 0x140000000 physical address. Ex.
> efi_fake_mem=1G at 0x100000000:0x40000,1G at 0x140000000:0x40000
>
> Signed-off-by: Alex Sierra <alex.sierra at amd.com>
> ---
> lib/test_hmm.c | 166 +++++++++++++++++++++++++++-----------------
> lib/test_hmm_uapi.h | 10 ++-
> 2 files changed, 113 insertions(+), 63 deletions(-)
>
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index ef27e355738a..e346a48e2509 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -469,6 +469,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> unsigned long pfn_first;
> unsigned long pfn_last;
> void *ptr;
> + int ret = -ENOMEM;
>
> devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
> if (!devmem)
> @@ -551,7 +552,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> }
> spin_unlock(&mdevice->lock);
>
> - return true;
> + return 0;
>
> err_release:
> mutex_unlock(&mdevice->devmem_lock);
> @@ -560,7 +561,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> err_devmem:
> kfree(devmem);
>
> - return false;
> + return ret;
> }
>
> static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
> @@ -569,8 +570,10 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
> struct page *rpage;
>
> /*
> - * This is a fake device so we alloc real system memory to store
> - * our device memory.
> + * For ZONE_DEVICE private type, this is a fake device so we alloc real
> + * system memory to store our device memory.
> + * For ZONE_DEVICE public type we use the actual dpage to store the data
> + * and ignore rpage.
> */
> rpage = alloc_page(GFP_HIGHUSER);
> if (!rpage)
> @@ -603,7 +606,7 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> struct dmirror *dmirror)
> {
> struct dmirror_device *mdevice = dmirror->mdevice;
> - const unsigned long *src = args->src;
> + unsigned long *src = args->src;
> unsigned long *dst = args->dst;
> unsigned long addr;
>
> @@ -621,12 +624,18 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> * unallocated pte_none() or read-only zero page.
> */
> spage = migrate_pfn_to_page(*src);
> -
> + if (spage && is_zone_device_page(spage)) {
> + pr_debug("page already in device spage pfn: 0x%lx\n",
> + page_to_pfn(spage));
> + *src &= ~MIGRATE_PFN_MIGRATE;
I don't think this is quite correct, callers shouldn't modify the src array. To
mark a page as not migrating callers need to set *dst = 0.
However I think this should be considered a test failure anyway. If we are
migrating from system to device memory we would have set
MIGRATE_VMA_SELECT_SYSTEM meaning no device private pages should be returned.
Therefore I don't think we can reach this unless there is a bug right?
> + continue;
> + }
> dpage = dmirror_devmem_alloc_page(mdevice);
> if (!dpage)
> continue;
>
> - rpage = dpage->zone_device_data;
> + rpage = is_device_private_page(dpage) ? dpage->zone_device_data :
> + dpage;
> if (spage)
> copy_highpage(rpage, spage);
> else
> @@ -638,8 +647,10 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> * the simulated device memory and that page holds the pointer
> * to the mirror.
> */
> + rpage = dpage->zone_device_data;
> rpage->zone_device_data = dmirror;
> -
> + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
> + page_to_pfn(spage), page_to_pfn(dpage));
> *dst = migrate_pfn(page_to_pfn(dpage)) |
> MIGRATE_PFN_LOCKED;
> if ((*src & MIGRATE_PFN_WRITE) ||
> @@ -673,10 +684,13 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
> continue;
>
> /*
> - * Store the page that holds the data so the page table
> - * doesn't have to deal with ZONE_DEVICE private pages.
> + * For ZONE_DEVICE private pages we store the page that
> + * holds the data so the page table doesn't have to deal it.
> + * For ZONE_DEVICE public pages we store the actual page, since
> + * the CPU has coherent access to the page.
> */
> - entry = dpage->zone_device_data;
> + entry = is_device_private_page(dpage) ? dpage->zone_device_data :
> + dpage;
> if (*dst & MIGRATE_PFN_WRITE)
> entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
> entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
> @@ -690,6 +704,47 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
> return 0;
> }
>
> +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> + struct dmirror *dmirror)
> +{
> + unsigned long *src = args->src;
> + unsigned long *dst = args->dst;
> + unsigned long start = args->start;
> + unsigned long end = args->end;
> + unsigned long addr;
> +
> + for (addr = start; addr < end; addr += PAGE_SIZE,
> + src++, dst++) {
> + struct page *dpage, *spage;
> +
> + spage = migrate_pfn_to_page(*src);
> + if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> + continue;
> + if (is_device_private_page(spage)) {
> + spage = spage->zone_device_data;
> + } else {
> + pr_debug("page already in system or SPM spage pfn: 0x%lx\n",
> + page_to_pfn(spage));
> + *src &= ~MIGRATE_PFN_MIGRATE;
Same comment as above - shouldn't touch *src and also shouldn't be able to get
here anyway.
> + continue;
> + }
> + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> + if (!dpage)
> + continue;
> + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
> + page_to_pfn(spage), page_to_pfn(dpage));
> +
> + lock_page(dpage);
> + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> + copy_highpage(dpage, spage);
> + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> + if (*src & MIGRATE_PFN_WRITE)
> + *dst |= MIGRATE_PFN_WRITE;
> + }
> + return 0;
> +}
> +
> +
> static int dmirror_migrate(struct dmirror *dmirror,
> struct hmm_dmirror_cmd *cmd)
> {
> @@ -731,33 +786,46 @@ static int dmirror_migrate(struct dmirror *dmirror,
> args.start = addr;
> args.end = next;
> args.pgmap_owner = dmirror->mdevice;
> - args.flags = MIGRATE_VMA_SELECT_SYSTEM;
> + args.flags = (!cmd->alloc_to_devmem &&
> + dmirror->mdevice->zone_device_type ==
> + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
> + MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
> + MIGRATE_VMA_SELECT_SYSTEM;
> ret = migrate_vma_setup(&args);
> if (ret)
> goto out;
>
> - dmirror_migrate_alloc_and_copy(&args, dmirror);
> + if (cmd->alloc_to_devmem) {
> + pr_debug("Migrating from sys mem to device mem\n");
> + dmirror_migrate_alloc_and_copy(&args, dmirror);
> + } else {
> + pr_debug("Migrating from device mem to sys mem\n");
> + dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
> + }
> migrate_vma_pages(&args);
> - dmirror_migrate_finalize_and_map(&args, dmirror);
> + if (cmd->alloc_to_devmem)
> + dmirror_migrate_finalize_and_map(&args, dmirror);
> migrate_vma_finalize(&args);
> }
> mmap_read_unlock(mm);
> mmput(mm);
>
> - /* Return the migrated data for verification. */
> - ret = dmirror_bounce_init(&bounce, start, size);
> - if (ret)
> - return ret;
> - mutex_lock(&dmirror->mutex);
> - ret = dmirror_do_read(dmirror, start, end, &bounce);
> - mutex_unlock(&dmirror->mutex);
> - if (ret == 0) {
> - if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
> - bounce.size))
> - ret = -EFAULT;
> + /* Return the migrated data for verification. only for pages in device zone */
> + if (cmd->alloc_to_devmem) {
> + ret = dmirror_bounce_init(&bounce, start, size);
> + if (ret)
> + return ret;
> + mutex_lock(&dmirror->mutex);
> + ret = dmirror_do_read(dmirror, start, end, &bounce);
> + mutex_unlock(&dmirror->mutex);
> + if (ret == 0) {
> + if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
> + bounce.size))
> + ret = -EFAULT;
> + }
> + cmd->cpages = bounce.cpages;
> + dmirror_bounce_fini(&bounce);
> }
> - cmd->cpages = bounce.cpages;
> - dmirror_bounce_fini(&bounce);
> return ret;
Rather than passing a flag (alloc_to_devmem) can you split this into two
functions - one to migrate to device memory and one to migrate to system
memory?
> out:
> @@ -781,9 +849,15 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
> }
>
> page = hmm_pfn_to_page(entry);
> - if (is_device_private_page(page)) {
> - /* Is the page migrated to this device or some other? */
> - if (dmirror->mdevice == dmirror_page_to_device(page))
> + if (is_device_page(page)) {
> + /* Is page ZONE_DEVICE public? */
> + if (!is_device_private_page(page))
> + *perm = HMM_DMIRROR_PROT_DEV_PUBLIC;
> + /*
> + * Is page ZONE_DEVICE private migrated to
> + * this device or some other?
> + */
> + else if (dmirror->mdevice == dmirror_page_to_device(page))
> *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
> else
> *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
> @@ -1030,38 +1104,6 @@ static void dmirror_devmem_free(struct page *page)
> spin_unlock(&mdevice->lock);
> }
>
> -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> - struct dmirror *dmirror)
> -{
> - const unsigned long *src = args->src;
> - unsigned long *dst = args->dst;
> - unsigned long start = args->start;
> - unsigned long end = args->end;
> - unsigned long addr;
> -
> - for (addr = start; addr < end; addr += PAGE_SIZE,
> - src++, dst++) {
> - struct page *dpage, *spage;
> -
> - spage = migrate_pfn_to_page(*src);
> - if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> - continue;
> - spage = spage->zone_device_data;
> -
> - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> - if (!dpage)
> - continue;
> -
> - lock_page(dpage);
> - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> - copy_highpage(dpage, spage);
> - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> - if (*src & MIGRATE_PFN_WRITE)
> - *dst |= MIGRATE_PFN_WRITE;
> - }
> - return 0;
> -}
> -
> static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
> {
> struct migrate_vma args;
> diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
> index 00259d994410..b6cb8a7d2470 100644
> --- a/lib/test_hmm_uapi.h
> +++ b/lib/test_hmm_uapi.h
> @@ -17,8 +17,12 @@
> * @addr: (in) user address the device will read/write
> * @ptr: (in) user address where device data is copied to/from
> * @npages: (in) number of pages to read/write
> + * @alloc_to_devmem: (in) desired allocation destination during migration.
> + * True if allocation is to device memory.
> + * False if allocation is to system memory.
> * @cpages: (out) number of pages copied
> * @faults: (out) number of device page faults seen
> + * @zone_device_type: (out) zone device memory type
> */
> struct hmm_dmirror_cmd {
> __u64 addr;
> @@ -26,7 +30,8 @@ struct hmm_dmirror_cmd {
> __u64 npages;
> __u64 cpages;
> __u64 faults;
> - __u64 zone_device_type;
> + __u32 zone_device_type;
> + __u32 alloc_to_devmem;
Similar comment here. Rather than add a boolean flag to every command could you
rename the existing command to HMM_DMIRROR_MIGRATE_TO_DEV and add another
(HMM_DMIRROR_MIGRATE_TO_SYS) for this new operation? I think that would end up
being a bit cleaner and matches how this actually gets used in hmm-test.c where
you end up defining hmm_migrate_sys_to_dev() and hmm_migrate_to_dev_sys()
anyway.
- Alistair
> };
>
> /* Expose the address space of the calling process through hmm device file */
> @@ -49,6 +54,8 @@ struct hmm_dmirror_cmd {
> * device the ioctl() is made
> * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
> * other device
> + * HMM_DMIRROR_PROT_DEV_PUBLIC: Migrate device public page on the device
> + * the ioctl() is made
> */
> enum {
> HMM_DMIRROR_PROT_ERROR = 0xFF,
> @@ -60,6 +67,7 @@ enum {
> HMM_DMIRROR_PROT_ZERO = 0x10,
> HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
> HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
> + HMM_DMIRROR_PROT_DEV_PUBLIC = 0x40,
> };
>
> enum {
>
More information about the amd-gfx
mailing list