[PATCH v5 01/10] mm: add zone device coherent type memory support

Alistair Popple apopple at nvidia.com
Mon Jan 31 06:20:54 UTC 2022


Looks good, feel free to add:

Reviewed-by: Alistair Popple <apopple at nvidia.com>

On Saturday, 29 January 2022 7:08:16 AM AEDT Alex Sierra wrote:
> Device memory that is cache coherent from device and CPU point of view.
> This is used on platforms that have an advanced system bus (like CAPI
> or CXL). Any page of a process can be migrated to such memory. However,
> no one should be allowed to pin such memory so that it can always be
> evicted.
> 
> Signed-off-by: Alex Sierra <alex.sierra at amd.com>
> Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> v4:
> - use the same system entry path for coherent device pages at
> migrate_vma_insert_page.
> 
> - Add coherent device type support for try_to_migrate /
> try_to_migrate_one.
> ---
>  include/linux/memremap.h |  8 +++++++
>  include/linux/mm.h       | 16 ++++++++++++++
>  mm/memcontrol.c          |  6 +++---
>  mm/memory-failure.c      |  8 +++++--
>  mm/memremap.c            | 14 ++++++++++++-
>  mm/migrate.c             | 45 ++++++++++++++++++++--------------------
>  mm/rmap.c                |  5 +++--
>  7 files changed, 71 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 1fafcc38acba..727b8c789193 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -39,6 +39,13 @@ struct vmem_altmap {
>   * A more complete discussion of unaddressable memory may be found in
>   * include/linux/hmm.h and Documentation/vm/hmm.rst.
>   *
> + * MEMORY_DEVICE_COHERENT:
> + * Device memory that is cache coherent from device and CPU point of view. This
> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
> + * type. Any page of a process can be migrated to such memory. However no one
> + * should be allowed to pin such memory so that it can always be evicted.
> + *
>   * MEMORY_DEVICE_FS_DAX:
>   * Host memory that has similar access semantics as System RAM i.e. DMA
>   * coherent and supports page pinning. In support of coordinating page
> @@ -59,6 +66,7 @@ struct vmem_altmap {
>  enum memory_type {
>  	/* 0 is reserved to catch uninitialized type fields */
>  	MEMORY_DEVICE_PRIVATE = 1,
> +	MEMORY_DEVICE_COHERENT,
>  	MEMORY_DEVICE_FS_DAX,
>  	MEMORY_DEVICE_GENERIC,
>  	MEMORY_DEVICE_PCI_P2PDMA,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e1a84b1e6787..0c61bf40edef 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1106,6 +1106,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>  		return false;
>  	switch (page->pgmap->type) {
>  	case MEMORY_DEVICE_PRIVATE:
> +	case MEMORY_DEVICE_COHERENT:
>  	case MEMORY_DEVICE_FS_DAX:
>  		return true;
>  	default:
> @@ -1135,6 +1136,21 @@ static inline bool is_device_private_page(const struct page *page)
>  		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>  }
>  
> +static inline bool is_device_coherent_page(const struct page *page)
> +{
> +	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> +		is_zone_device_page(page) &&
> +		page->pgmap->type == MEMORY_DEVICE_COHERENT;
> +}
> +
> +static inline bool is_dev_private_or_coherent_page(const struct page *page)
> +{
> +	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> +		is_zone_device_page(page) &&
> +		(page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +		page->pgmap->type == MEMORY_DEVICE_COHERENT);
> +}
> +
>  static inline bool is_pci_p2pdma_page(const struct page *page)
>  {
>  	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 09d342c7cbd0..0882b5b2a857 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5691,8 +5691,8 @@ static int mem_cgroup_move_account(struct page *page,
>   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   *     target for charge migration. if @target is not NULL, the entry is stored
>   *     in target->ent.
> - *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
> - *     (so ZONE_DEVICE page and thus not on the lru).
> + *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
> + *   thus not on the lru.
>   *     For now we such page is charge like a regular page would be as for all
>   *     intent and purposes it is just special memory taking the place of a
>   *     regular page.
> @@ -5726,7 +5726,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>  		 */
>  		if (page_memcg(page) == mc.from) {
>  			ret = MC_TARGET_PAGE;
> -			if (is_device_private_page(page))
> +			if (is_dev_private_or_coherent_page(page))
>  				ret = MC_TARGET_DEVICE;
>  			if (target)
>  				target->page = page;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 14ae5c18e776..e83740f7f05e 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1611,12 +1611,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>  		goto unlock;
>  	}
>  
> -	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
> +	switch (pgmap->type) {
> +	case MEMORY_DEVICE_PRIVATE:
> +	case MEMORY_DEVICE_COHERENT:
>  		/*
> -		 * TODO: Handle HMM pages which may need coordination
> +		 * TODO: Handle device pages which may need coordination
>  		 * with device-side memory.
>  		 */
>  		goto unlock;
> +	default:
> +		break;
>  	}
>  
>  	/*
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 6aa5f0c2d11f..354c8027823f 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  {
>  	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +	    pgmap->type == MEMORY_DEVICE_COHERENT ||
>  	    pgmap->type == MEMORY_DEVICE_FS_DAX)
>  		static_branch_dec(&devmap_managed_key);
>  }
> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>  {
>  	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +	    pgmap->type == MEMORY_DEVICE_COHERENT ||
>  	    pgmap->type == MEMORY_DEVICE_FS_DAX)
>  		static_branch_inc(&devmap_managed_key);
>  }
> @@ -327,6 +329,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>  			return ERR_PTR(-EINVAL);
>  		}
>  		break;
> +	case MEMORY_DEVICE_COHERENT:
> +		if (!pgmap->ops->page_free) {
> +			WARN(1, "Missing page_free method\n");
> +			return ERR_PTR(-EINVAL);
> +		}
> +		if (!pgmap->owner) {
> +			WARN(1, "Missing owner\n");
> +			return ERR_PTR(-EINVAL);
> +		}
> +		break;
>  	case MEMORY_DEVICE_FS_DAX:
>  		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
>  		    IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
> @@ -469,7 +481,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  void free_devmap_managed_page(struct page *page)
>  {
>  	/* notify page idle for dax */
> -	if (!is_device_private_page(page)) {
> +	if (!is_dev_private_or_coherent_page(page)) {
>  		wake_up_var(&page->_refcount);
>  		return;
>  	}
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..cd137aedcfe5 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -345,7 +345,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>  	 * Device private pages have an extra refcount as they are
>  	 * ZONE_DEVICE pages.
>  	 */
> -	expected_count += is_device_private_page(page);
> +	expected_count += is_dev_private_or_coherent_page(page);
>  	if (mapping)
>  		expected_count += compound_nr(page) + page_has_private(page);
>  
> @@ -2611,7 +2611,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>   *     handle_pte_fault()
>   *       do_anonymous_page()
>   * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> - * private page.
> + * private or coherent page.
>   */
>  static void migrate_vma_insert_page(struct migrate_vma *migrate,
>  				    unsigned long addr,
> @@ -2676,25 +2676,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>  	 */
>  	__SetPageUptodate(page);
>  
> -	if (is_zone_device_page(page)) {
> -		if (is_device_private_page(page)) {
> -			swp_entry_t swp_entry;
> +	if (is_device_private_page(page)) {
> +		swp_entry_t swp_entry;
>  
> -			if (vma->vm_flags & VM_WRITE)
> -				swp_entry = make_writable_device_private_entry(
> -							page_to_pfn(page));
> -			else
> -				swp_entry = make_readable_device_private_entry(
> -							page_to_pfn(page));
> -			entry = swp_entry_to_pte(swp_entry);
> -		} else {
> -			/*
> -			 * For now we only support migrating to un-addressable
> -			 * device memory.
> -			 */
> -			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> -			goto abort;
> -		}
> +		if (vma->vm_flags & VM_WRITE)
> +			swp_entry = make_writable_device_private_entry(
> +						page_to_pfn(page));
> +		else
> +			swp_entry = make_readable_device_private_entry(
> +						page_to_pfn(page));
> +		entry = swp_entry_to_pte(swp_entry);
> +	} else if (is_zone_device_page(page) &&
> +		   !is_device_coherent_page(page)) {
> +		/*
> +		 * We support migrating to private and coherent types
> +		 * for device zone memory.
> +		 */
> +		pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> +		goto abort;
>  	} else {
>  		entry = mk_pte(page, vma->vm_page_prot);
>  		if (vma->vm_flags & VM_WRITE)
> @@ -2796,10 +2795,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>  		mapping = page_mapping(page);
>  
>  		if (is_zone_device_page(newpage)) {
> -			if (is_device_private_page(newpage)) {
> +			if (is_dev_private_or_coherent_page(newpage)) {
>  				/*
> -				 * For now only support private anonymous when
> -				 * migrating to un-addressable device memory.
> +				 * For now only support private and coherent
> +				 * anonymous when migrating to device memory.
>  				 */
>  				if (mapping) {
>  					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 6a1e8c7f6213..f2bd5a3da72c 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1835,7 +1835,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
>  		/* Update high watermark before we lower rss */
>  		update_hiwater_rss(mm);
>  
> -		if (is_zone_device_page(page)) {
> +		if (is_device_private_page(page)) {
>  			unsigned long pfn = page_to_pfn(page);
>  			swp_entry_t entry;
>  			pte_t swp_pte;
> @@ -1976,7 +1976,8 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
>  					TTU_SYNC)))
>  		return;
>  
> -	if (is_zone_device_page(page) && !is_device_private_page(page))
> +	if (is_zone_device_page(page) &&
> +	    !is_dev_private_or_coherent_page(page))
>  		return;
>  
>  	/*
> 






More information about the amd-gfx mailing list