[PATCH v3 08/10] lib: add support for device coherent type in test_hmm

Alistair Popple apopple at nvidia.com
Thu Jan 20 06:00:50 UTC 2022


On Tuesday, 11 January 2022 9:31:59 AM AEDT Alex Sierra wrote:
> Device Coherent type uses device memory that is coherently accesible by
> the CPU. This could be shown as SP (special purpose) memory range
> at the BIOS-e820 memory enumeration. If no SP memory is supported in
> system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.
> 
> Currently, test_hmm only supports two different SP ranges of at least
> 256MB size. This could be specified in the kernel parameter variable
> efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
> 0x140000000 physical address. Ex.
> efi_fake_mem=1G at 0x100000000:0x40000,1G at 0x140000000:0x40000
> 
> Private and coherent device mirror instances can be created in the same
> probed. This is done by passing the module parameters spm_addr_dev0 &
> spm_addr_dev1. In this case, it will create four instances of
> device_mirror. The first two correspond to private device type, the
> last two to coherent type. Then, they can be easily accessed from user
> space through /dev/hmm_mirror<num_device>. Usually num_device 0 and 1
> are for private, and 2 and 3 for coherent types. If no module
> parameters are passed, two instances of private type device_mirror will
> be created only.
> 
> Signed-off-by: Alex Sierra <alex.sierra at amd.com>
> ---
>  lib/test_hmm.c      | 247 ++++++++++++++++++++++++++++++++------------
>  lib/test_hmm_uapi.h |  15 ++-
>  2 files changed, 193 insertions(+), 69 deletions(-)
> 
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 9edeff52302e..7c641c5a9cfa 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -29,11 +29,22 @@
>  
>  #include "test_hmm_uapi.h"
>  
> -#define DMIRROR_NDEVICES		2
> +#define DMIRROR_NDEVICES		4
>  #define DMIRROR_RANGE_FAULT_TIMEOUT	1000
>  #define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
>  #define DEVMEM_CHUNKS_RESERVE		16
>  
> +/*
> + * For device_private pages, dpage is just a dummy struct page
> + * representing a piece of device memory. dmirror_devmem_alloc_page
> + * allocates a real system memory page as backing storage to fake a
> + * real device. zone_device_data points to that backing page. But
> + * for device_coherent memory, the struct page represents real
> + * physical CPU-accessible memory that we can use directly.
> + */
> +#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
> +			   (page)->zone_device_data : (page))
> +
>  static unsigned long spm_addr_dev0;
>  module_param(spm_addr_dev0, long, 0644);
>  MODULE_PARM_DESC(spm_addr_dev0,
> @@ -122,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
>  	return 0;
>  }
>  
> +static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
> +{
> +	return (mdevice->zone_device_type ==
> +		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
> +}
> +
> +static enum migrate_vma_direction
> +	dmirror_select_device(struct dmirror *dmirror)
> +{
> +	return (dmirror->mdevice->zone_device_type ==
> +		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
> +		MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
> +		MIGRATE_VMA_SELECT_DEVICE_COHERENT;
> +}
> +
>  static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
>  {
>  	vfree(bounce->ptr);
> @@ -572,16 +598,19 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
>  static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
>  {
>  	struct page *dpage = NULL;
> -	struct page *rpage;
> +	struct page *rpage = NULL;
>  
>  	/*
> -	 * This is a fake device so we alloc real system memory to store
> -	 * our device memory.
> +	 * For ZONE_DEVICE private type, this is a fake device so we alloc real
> +	 * system memory to store our device memory.
> +	 * For ZONE_DEVICE coherent type we use the actual dpage to store the data
> +	 * and ignore rpage.
>  	 */
> -	rpage = alloc_page(GFP_HIGHUSER);
> -	if (!rpage)
> -		return NULL;
> -
> +	if (dmirror_is_private_zone(mdevice)) {
> +		rpage = alloc_page(GFP_HIGHUSER);
> +		if (!rpage)
> +			return NULL;
> +	}
>  	spin_lock(&mdevice->lock);
>  
>  	if (mdevice->free_pages) {
> @@ -601,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
>  	return dpage;
>  
>  error:
> -	__free_page(rpage);
> +	if (rpage)
> +		__free_page(rpage);
>  	return NULL;
>  }
>  
> @@ -627,12 +657,15 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
>  		 * unallocated pte_none() or read-only zero page.
>  		 */
>  		spage = migrate_pfn_to_page(*src);
> +		WARN(spage && is_zone_device_page(spage),
> +		     "page already in device spage pfn: 0x%lx\n",
> +		     page_to_pfn(spage));

This should also lead to test failure because we are only supposed to be
selecting system pages. Ie.

if (WARN_ON_ONCE()) {
	dpage = NULL;
	continue;
}

>  		dpage = dmirror_devmem_alloc_page(mdevice);
>  		if (!dpage)
>  			continue;
>  
> -		rpage = dpage->zone_device_data;
> +		rpage = BACKING_PAGE(dpage);
>  		if (spage)
>  			copy_highpage(rpage, spage);
>  		else
> @@ -646,6 +679,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
>  		 */
>  		rpage->zone_device_data = dmirror;
>  
> +		pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
> +			 page_to_pfn(spage), page_to_pfn(dpage));
>  		*dst = migrate_pfn(page_to_pfn(dpage)) |
>  			    MIGRATE_PFN_LOCKED;
>  		if ((*src & MIGRATE_PFN_WRITE) ||
> @@ -724,11 +759,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
>  		if (!dpage)
>  			continue;
>  
> -		/*
> -		 * Store the page that holds the data so the page table
> -		 * doesn't have to deal with ZONE_DEVICE private pages.
> -		 */
> -		entry = dpage->zone_device_data;
> +		entry = BACKING_PAGE(dpage);
>  		if (*dst & MIGRATE_PFN_WRITE)
>  			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
>  		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
> @@ -808,8 +839,101 @@ static int dmirror_exclusive(struct dmirror *dmirror,
>  	return ret;
>  }
>  
> -static int dmirror_migrate(struct dmirror *dmirror,
> -			   struct hmm_dmirror_cmd *cmd)
> +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> +						      struct dmirror *dmirror)
> +{
> +	const unsigned long *src = args->src;
> +	unsigned long *dst = args->dst;
> +	unsigned long start = args->start;
> +	unsigned long end = args->end;
> +	unsigned long addr;
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE,
> +				       src++, dst++) {
> +		struct page *dpage, *spage;
> +
> +		spage = migrate_pfn_to_page(*src);
> +		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		WARN_ON(!is_device_page(spage));

Same - the test should fail in this case (ie. by not migrating the page).

> +		spage = BACKING_PAGE(spage);
> +		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> +		if (!dpage)
> +			continue;
> +		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
> +			 page_to_pfn(spage), page_to_pfn(dpage));
> +
> +		lock_page(dpage);
> +		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> +		copy_highpage(dpage, spage);
> +		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> +		if (*src & MIGRATE_PFN_WRITE)
> +			*dst |= MIGRATE_PFN_WRITE;
> +	}
> +	return 0;
> +}
> +
> +static int dmirror_migrate_to_system(struct dmirror *dmirror,
> +				     struct hmm_dmirror_cmd *cmd)
> +{
> +	unsigned long start, end, addr;
> +	unsigned long size = cmd->npages << PAGE_SHIFT;
> +	struct mm_struct *mm = dmirror->notifier.mm;
> +	struct vm_area_struct *vma;
> +	unsigned long src_pfns[64];
> +	unsigned long dst_pfns[64];
> +	struct migrate_vma args;
> +	unsigned long next;
> +	int ret;
> +
> +	start = cmd->addr;
> +	end = start + size;
> +	if (end < start)
> +		return -EINVAL;
> +
> +	/* Since the mm is for the mirrored process, get a reference first. */
> +	if (!mmget_not_zero(mm))
> +		return -EINVAL;
> +
> +	mmap_read_lock(mm);
> +	for (addr = start; addr < end; addr = next) {
> +		vma = vma_lookup(mm, addr);
> +		if (!vma || !(vma->vm_flags & VM_READ)) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
> +		if (next > vma->vm_end)
> +			next = vma->vm_end;
> +
> +		args.vma = vma;
> +		args.src = src_pfns;
> +		args.dst = dst_pfns;
> +		args.start = addr;
> +		args.end = next;
> +		args.pgmap_owner = dmirror->mdevice;
> +		args.flags = dmirror_select_device(dmirror);
> +
> +		ret = migrate_vma_setup(&args);
> +		if (ret)
> +			goto out;
> +
> +		pr_debug("Migrating from device mem to sys mem\n");
> +		dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
> +
> +		migrate_vma_pages(&args);
> +		migrate_vma_finalize(&args);
> +	}
> +out:
> +	mmap_read_unlock(mm);
> +	mmput(mm);
> +	return ret;

I think you need to return the number of pages successfully migrated otherwise
failures migrating back to system memory will go unnoticed. This isn't a
problem for device private pages because in that case pages are migrated back
in response to a fault, and if they fail the process will get killed with
SIGBUS.

> +}
> +
> +static int dmirror_migrate_to_device(struct dmirror *dmirror,
> +				struct hmm_dmirror_cmd *cmd)
>  {
>  	unsigned long start, end, addr;
>  	unsigned long size = cmd->npages << PAGE_SHIFT;
> @@ -853,6 +977,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
>  		if (ret)
>  			goto out;
>  
> +		pr_debug("Migrating from sys mem to device mem\n");
>  		dmirror_migrate_alloc_and_copy(&args, dmirror);
>  		migrate_vma_pages(&args);
>  		dmirror_migrate_finalize_and_map(&args, dmirror);
> @@ -861,7 +986,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
>  	mmap_read_unlock(mm);
>  	mmput(mm);
>  
> -	/* Return the migrated data for verification. */
> +	/* Return the migrated data for verification. only for pages in device zone */
>  	ret = dmirror_bounce_init(&bounce, start, size);
>  	if (ret)
>  		return ret;
> @@ -898,12 +1023,22 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
>  	}
>  
>  	page = hmm_pfn_to_page(entry);
> -	if (is_device_private_page(page)) {
> -		/* Is the page migrated to this device or some other? */
> -		if (dmirror->mdevice == dmirror_page_to_device(page))
> +	if (is_device_page(page)) {
> +		/* Is page ZONE_DEVICE coherent? */
> +		if (!is_device_private_page(page)) {

This would read more clearly without the negation -
ie. `if (is_device_coherent_page(page))`.

> +			if (dmirror->mdevice == dmirror_page_to_device(page))
> +				*perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
> +			else
> +				*perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
> +		/*
> +		 * Is page ZONE_DEVICE private migrated to
> +		 * this device or some other?
> +		 */
> +		} else if (dmirror->mdevice == dmirror_page_to_device(page)) {
>  			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
> -		else
> +		} else {
>  			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
> +		}
>  	} else if (is_zero_pfn(page_to_pfn(page)))
>  		*perm = HMM_DMIRROR_PROT_ZERO;
>  	else
> @@ -1100,8 +1235,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
>  		ret = dmirror_write(dmirror, &cmd);
>  		break;
>  
> -	case HMM_DMIRROR_MIGRATE:
> -		ret = dmirror_migrate(dmirror, &cmd);
> +	case HMM_DMIRROR_MIGRATE_TO_DEV:
> +		ret = dmirror_migrate_to_device(dmirror, &cmd);
> +		break;
> +
> +	case HMM_DMIRROR_MIGRATE_TO_SYS:
> +		ret = dmirror_migrate_to_system(dmirror, &cmd);
>  		break;
>  
>  	case HMM_DMIRROR_EXCLUSIVE:
> @@ -1142,14 +1281,13 @@ static const struct file_operations dmirror_fops = {
>  
>  static void dmirror_devmem_free(struct page *page)
>  {
> -	struct page *rpage = page->zone_device_data;
> +	struct page *rpage = BACKING_PAGE(page);
>  	struct dmirror_device *mdevice;
>  
> -	if (rpage)
> +	if (rpage != page)
>  		__free_page(rpage);
>  
>  	mdevice = dmirror_page_to_device(page);
> -
>  	spin_lock(&mdevice->lock);
>  	mdevice->cfree++;
>  	page->zone_device_data = mdevice->free_pages;
> @@ -1157,38 +1295,6 @@ static void dmirror_devmem_free(struct page *page)
>  	spin_unlock(&mdevice->lock);
>  }
>  
> -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> -						      struct dmirror *dmirror)
> -{
> -	const unsigned long *src = args->src;
> -	unsigned long *dst = args->dst;
> -	unsigned long start = args->start;
> -	unsigned long end = args->end;
> -	unsigned long addr;
> -
> -	for (addr = start; addr < end; addr += PAGE_SIZE,
> -				       src++, dst++) {
> -		struct page *dpage, *spage;
> -
> -		spage = migrate_pfn_to_page(*src);
> -		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> -			continue;
> -		spage = spage->zone_device_data;
> -
> -		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> -		if (!dpage)
> -			continue;
> -
> -		lock_page(dpage);
> -		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> -		copy_highpage(dpage, spage);
> -		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> -		if (*src & MIGRATE_PFN_WRITE)
> -			*dst |= MIGRATE_PFN_WRITE;
> -	}
> -	return 0;
> -}
> -
>  static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>  {
>  	struct migrate_vma args;
> @@ -1203,7 +1309,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>  	 * the mirror but here we use it to hold the page for the simulated
>  	 * device memory and that page holds the pointer to the mirror.
>  	 */
> -	rpage = vmf->page->zone_device_data;
> +	rpage = BACKING_PAGE(vmf->page);

Note that as I understand things this isn't strictly required because this will
never be called for device coherent pages.

>  	dmirror = rpage->zone_device_data;
>  
>  	/* FIXME demonstrate how we can adjust migrate range */
> @@ -1213,7 +1319,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>  	args.src = &src_pfns;
>  	args.dst = &dst_pfns;
>  	args.pgmap_owner = dmirror->mdevice;
> -	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
> +	args.flags = dmirror_select_device(dmirror);
>  
>  	if (migrate_vma_setup(&args))
>  		return VM_FAULT_SIGBUS;
> @@ -1279,14 +1385,26 @@ static void dmirror_device_remove(struct dmirror_device *mdevice)
>  static int __init hmm_dmirror_init(void)
>  {
>  	int ret;
> -	int id;
> +	int id = 0;
> +	int ndevices = 0;
>  
>  	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
>  				  "HMM_DMIRROR");
>  	if (ret)
>  		goto err_unreg;
>  
> -	for (id = 0; id < DMIRROR_NDEVICES; id++) {
> +	memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
> +	dmirror_devices[ndevices++].zone_device_type =
> +				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
> +	dmirror_devices[ndevices++].zone_device_type =
> +				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
> +	if (spm_addr_dev0 && spm_addr_dev1) {
> +		dmirror_devices[ndevices++].zone_device_type =
> +					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
> +		dmirror_devices[ndevices++].zone_device_type =
> +					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
> +	}
> +	for (id = 0; id < ndevices; id++) {
>  		ret = dmirror_device_init(dmirror_devices + id, id);
>  		if (ret)
>  			goto err_chrdev;
> @@ -1308,7 +1426,8 @@ static void __exit hmm_dmirror_exit(void)
>  	int id;
>  
>  	for (id = 0; id < DMIRROR_NDEVICES; id++)
> -		dmirror_device_remove(dmirror_devices + id);
> +		if (dmirror_devices[id].zone_device_type)

Oh, I understand why you reserved 0 for zone_device_type now. But I still
thinks it's unnecessary - hmm_dmirror_exit() getting called implies
hmm_dmirror_init() was successful which implies zone_device_type was
initialised for all devices.

> +			dmirror_device_remove(dmirror_devices + id);
>  	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
>  }
>  
> diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
> index 625f3690d086..e190b2ab6f19 100644
> --- a/lib/test_hmm_uapi.h
> +++ b/lib/test_hmm_uapi.h
> @@ -33,11 +33,12 @@ struct hmm_dmirror_cmd {
>  /* Expose the address space of the calling process through hmm device file */
>  #define HMM_DMIRROR_READ		_IOWR('H', 0x00, struct hmm_dmirror_cmd)
>  #define HMM_DMIRROR_WRITE		_IOWR('H', 0x01, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_MIGRATE		_IOWR('H', 0x02, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_SNAPSHOT		_IOWR('H', 0x03, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_EXCLUSIVE		_IOWR('H', 0x04, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_CHECK_EXCLUSIVE	_IOWR('H', 0x05, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_GET_MEM_DEV_TYPE	_IOWR('H', 0x06, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_MIGRATE_TO_DEV	_IOWR('H', 0x02, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_MIGRATE_TO_SYS	_IOWR('H', 0x03, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_SNAPSHOT		_IOWR('H', 0x04, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_EXCLUSIVE		_IOWR('H', 0x05, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_CHECK_EXCLUSIVE	_IOWR('H', 0x06, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_GET_MEM_DEV_TYPE	_IOWR('H', 0x07, struct hmm_dmirror_cmd)
>  
>  /*
>   * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
> @@ -52,6 +53,8 @@ struct hmm_dmirror_cmd {
>   *					device the ioctl() is made
>   * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
>   *					other device
> + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device
> + *				  the ioctl() is made
>   */
>  enum {
>  	HMM_DMIRROR_PROT_ERROR			= 0xFF,
> @@ -63,6 +66,8 @@ enum {
>  	HMM_DMIRROR_PROT_ZERO			= 0x10,
>  	HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL	= 0x20,
>  	HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE	= 0x30,
> +	HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL	= 0x40,
> +	HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE	= 0x50,
>  };
>  
>  enum {
> 






More information about the amd-gfx mailing list