[v3 03/11] mm/migrate_device: THP migration of zone device pages

Balbir Singh balbirs at nvidia.com
Tue Aug 12 23:36:10 UTC 2025


On 8/12/25 15:35, Mika Penttilä wrote:
> Hi,
> 
> On 8/12/25 05:40, Balbir Singh wrote:
> 
>> MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during
>> migrate_vma_setup(), and MIGRATE_PFN_COMPOUND will cause device pages
>> to be migrated as compound pages during device pfn migration.
>>
>> migrate_device code paths go through the collect, setup
>> and finalize phases of migration.
>>
>> The entries in the src and dst arrays passed to these functions remain
>> at a PAGE_SIZE granularity. When a compound page is passed, the first
>> entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set
>> (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), and the remaining
>> (HPAGE_PMD_NR - 1) entries are filled with 0's. This representation
>> allows the compound page to be split into smaller page sizes.
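>>
>> For example, for a single PMD-sized THP the src array would look
>> roughly as follows (an illustrative sketch; migrate_pfn() already sets
>> MIGRATE_PFN_VALID, and MIGRATE_PFN_WRITE is added when the mapping is
>> writable):
>>
>>	src[0] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE |
>>		 MIGRATE_PFN_COMPOUND;
>>	for (i = 1; i < HPAGE_PMD_NR; i++)
>>		src[i] = 0;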
>>
>> migrate_vma_collect_hole() and migrate_vma_collect_pmd() are now THP
>> aware. Two new helper functions, migrate_vma_collect_huge_pmd() and
>> migrate_vma_insert_huge_pmd_page(), have been added.
>>
>> migrate_vma_collect_huge_pmd() can collect THP pages, but if this
>> fails for some reason, there is fallback support to split the folio
>> and migrate it.
>>
>> migrate_vma_insert_huge_pmd_page() closely follows the logic of
>> migrate_vma_insert_page().
>>
>> Support for splitting pages as needed for migration will follow in
>> later patches in this series.
>>
>> Cc: Andrew Morton <akpm at linux-foundation.org>
>> Cc: David Hildenbrand <david at redhat.com>
>> Cc: Zi Yan <ziy at nvidia.com>
>> Cc: Joshua Hahn <joshua.hahnjy at gmail.com>
>> Cc: Rakie Kim <rakie.kim at sk.com>
>> Cc: Byungchul Park <byungchul at sk.com>
>> Cc: Gregory Price <gourry at gourry.net>
>> Cc: Ying Huang <ying.huang at linux.alibaba.com>
>> Cc: Alistair Popple <apopple at nvidia.com>
>> Cc: Oscar Salvador <osalvador at suse.de>
>> Cc: Lorenzo Stoakes <lorenzo.stoakes at oracle.com>
>> Cc: Baolin Wang <baolin.wang at linux.alibaba.com>
>> Cc: "Liam R. Howlett" <Liam.Howlett at oracle.com>
>> Cc: Nico Pache <npache at redhat.com>
>> Cc: Ryan Roberts <ryan.roberts at arm.com>
>> Cc: Dev Jain <dev.jain at arm.com>
>> Cc: Barry Song <baohua at kernel.org>
>> Cc: Lyude Paul <lyude at redhat.com>
>> Cc: Danilo Krummrich <dakr at kernel.org>
>> Cc: David Airlie <airlied at gmail.com>
>> Cc: Simona Vetter <simona at ffwll.ch>
>> Cc: Ralph Campbell <rcampbell at nvidia.com>
>> Cc: Mika Penttilä <mpenttil at redhat.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: Francois Dugast <francois.dugast at intel.com>
>>
>> Signed-off-by: Balbir Singh <balbirs at nvidia.com>
>> ---
>>  include/linux/migrate.h |   2 +
>>  mm/migrate_device.c     | 457 ++++++++++++++++++++++++++++++++++------
>>  2 files changed, 396 insertions(+), 63 deletions(-)
>>
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index acadd41e0b5c..d9cef0819f91 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -129,6 +129,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node)
>>  #define MIGRATE_PFN_VALID	(1UL << 0)
>>  #define MIGRATE_PFN_MIGRATE	(1UL << 1)
>>  #define MIGRATE_PFN_WRITE	(1UL << 3)
>> +#define MIGRATE_PFN_COMPOUND	(1UL << 4)
>>  #define MIGRATE_PFN_SHIFT	6
>>  
>>  static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
>> @@ -147,6 +148,7 @@ enum migrate_vma_direction {
>>  	MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
>>  	MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
>>  	MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
>> +	MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
>>  };
>>  
>>  struct migrate_vma {
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index 0ed337f94fcd..6621bba62710 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -14,6 +14,7 @@
>>  #include <linux/pagewalk.h>
>>  #include <linux/rmap.h>
>>  #include <linux/swapops.h>
>> +#include <asm/pgalloc.h>
>>  #include <asm/tlbflush.h>
>>  #include "internal.h"
>>  
>> @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
>>  	if (!vma_is_anonymous(walk->vma))
>>  		return migrate_vma_collect_skip(start, end, walk);
>>  
>> +	if (thp_migration_supported() &&
>> +		(migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
>> +		(IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
>> +		 IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
>> +		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
>> +						MIGRATE_PFN_COMPOUND;
>> +		migrate->dst[migrate->npages] = 0;
>> +		migrate->npages++;
>> +		migrate->cpages++;
>> +
>> +		/*
>> +		 * Collect the remaining entries as holes, in case we
>> +		 * need to split later
>> +		 */
>> +		return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
>> +	}
>> +
>>  	for (addr = start; addr < end; addr += PAGE_SIZE) {
>>  		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
>>  		migrate->dst[migrate->npages] = 0;
>> @@ -54,57 +72,151 @@ static int migrate_vma_collect_hole(unsigned long start,
>>  	return 0;
>>  }
>>  
>> -static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> -				   unsigned long start,
>> -				   unsigned long end,
>> -				   struct mm_walk *walk)
>> +/**
>> + * migrate_vma_collect_huge_pmd - collect THP pages without splitting the
>> + * folio for device private pages.
>> + * @pmdp: pointer to pmd entry
>> + * @start: start address of the range for migration
>> + * @end: end address of the range for migration
>> + * @walk: mm_walk callback structure
>> + *
>> + * Collect the huge pmd entry at @pmdp for migration and set the
>> + * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
>> + * migration will occur at HPAGE_PMD granularity
>> + */
>> +static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
>> +					unsigned long end, struct mm_walk *walk,
>> +					struct folio *fault_folio)
>>  {
>> +	struct mm_struct *mm = walk->mm;
>> +	struct folio *folio;
>>  	struct migrate_vma *migrate = walk->private;
>> -	struct folio *fault_folio = migrate->fault_page ?
>> -		page_folio(migrate->fault_page) : NULL;
>> -	struct vm_area_struct *vma = walk->vma;
>> -	struct mm_struct *mm = vma->vm_mm;
>> -	unsigned long addr = start, unmapped = 0;
>>  	spinlock_t *ptl;
>> -	pte_t *ptep;
>> +	swp_entry_t entry;
>> +	int ret;
>> +	unsigned long write = 0;
>>  
>> -again:
>> -	if (pmd_none(*pmdp))
>> +	ptl = pmd_lock(mm, pmdp);
>> +	if (pmd_none(*pmdp)) {
>> +		spin_unlock(ptl);
>>  		return migrate_vma_collect_hole(start, end, -1, walk);
>> +	}
>>  
>>  	if (pmd_trans_huge(*pmdp)) {
>> -		struct folio *folio;
>> -
>> -		ptl = pmd_lock(mm, pmdp);
>> -		if (unlikely(!pmd_trans_huge(*pmdp))) {
>> +		if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
>>  			spin_unlock(ptl);
>> -			goto again;
>> +			return migrate_vma_collect_skip(start, end, walk);
>>  		}
>>  
>>  		folio = pmd_folio(*pmdp);
>>  		if (is_huge_zero_folio(folio)) {
>>  			spin_unlock(ptl);
>> -			split_huge_pmd(vma, pmdp, addr);
>> -		} else {
>> -			int ret;
>> +			return migrate_vma_collect_hole(start, end, -1, walk);
>> +		}
>> +		if (pmd_write(*pmdp))
>> +			write = MIGRATE_PFN_WRITE;
>> +	} else if (!pmd_present(*pmdp)) {
>> +		entry = pmd_to_swp_entry(*pmdp);
>> +		folio = pfn_swap_entry_folio(entry);
>> +
>> +		if (!is_device_private_entry(entry) ||
>> +			!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
>> +			(folio->pgmap->owner != migrate->pgmap_owner)) {
>> +			spin_unlock(ptl);
>> +			return migrate_vma_collect_skip(start, end, walk);
>> +		}
>>  
>> -			folio_get(folio);
>> +		if (is_migration_entry(entry)) {
>> +			migration_entry_wait_on_locked(entry, ptl);
>>  			spin_unlock(ptl);
>> -			/* FIXME: we don't expect THP for fault_folio */
>> -			if (WARN_ON_ONCE(fault_folio == folio))
>> -				return migrate_vma_collect_skip(start, end,
>> -								walk);
>> -			if (unlikely(!folio_trylock(folio)))
>> -				return migrate_vma_collect_skip(start, end,
>> -								walk);
>> -			ret = split_folio(folio);
>> -			if (fault_folio != folio)
>> -				folio_unlock(folio);
>> -			folio_put(folio);
>> -			if (ret)
>> -				return migrate_vma_collect_skip(start, end,
>> -								walk);
>> +			return -EAGAIN;
>>  		}
>> +
>> +		if (is_writable_device_private_entry(entry))
>> +			write = MIGRATE_PFN_WRITE;
>> +	} else {
>> +		spin_unlock(ptl);
>> +		return -EAGAIN;
>> +	}
>> +
>> +	folio_get(folio);
>> +	if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
>> +		spin_unlock(ptl);
>> +		folio_put(folio);
>> +		return migrate_vma_collect_skip(start, end, walk);
>> +	}
>> +
>> +	if (thp_migration_supported() &&
>> +		(migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
>> +		(IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
>> +		 IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
>> +
>> +		struct page_vma_mapped_walk pvmw = {
>> +			.ptl = ptl,
>> +			.address = start,
>> +			.pmd = pmdp,
>> +			.vma = walk->vma,
>> +		};
>> +
>> +		unsigned long pfn = page_to_pfn(folio_page(folio, 0));
>> +
>> +		migrate->src[migrate->npages] = migrate_pfn(pfn) | write
>> +						| MIGRATE_PFN_MIGRATE
>> +						| MIGRATE_PFN_COMPOUND;
>> +		migrate->dst[migrate->npages++] = 0;
>> +		migrate->cpages++;
>> +		ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
>> +		if (ret) {
>> +			migrate->npages--;
>> +			migrate->cpages--;
>> +			migrate->src[migrate->npages] = 0;
>> +			migrate->dst[migrate->npages] = 0;
>> +			goto fallback;
>> +		}
>> +		migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
>> +		spin_unlock(ptl);
>> +		return 0;
>> +	}
>> +
>> +fallback:
>> +	spin_unlock(ptl);
>> +	if (!folio_test_large(folio))
>> +		goto done;
>> +	ret = split_folio(folio);
>> +	if (fault_folio != folio)
>> +		folio_unlock(folio);
>> +	folio_put(folio);
>> +	if (ret)
>> +		return migrate_vma_collect_skip(start, end, walk);
>> +	if (pmd_none(pmdp_get_lockless(pmdp)))
>> +		return migrate_vma_collect_hole(start, end, -1, walk);
>> +
>> +done:
>> +	return -ENOENT;
>> +}
>> +
>> +static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> +				   unsigned long start,
>> +				   unsigned long end,
>> +				   struct mm_walk *walk)
>> +{
>> +	struct migrate_vma *migrate = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	unsigned long addr = start, unmapped = 0;
>> +	spinlock_t *ptl;
>> +	struct folio *fault_folio = migrate->fault_page ?
>> +		page_folio(migrate->fault_page) : NULL;
>> +	pte_t *ptep;
>> +
>> +again:
>> +	if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
>> +		int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
>> +
>> +		if (ret == -EAGAIN)
>> +			goto again;
>> +		if (ret == 0)
>> +			return 0;
>>  	}
>>  
>>  	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>> @@ -222,8 +334,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>  			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
>>  		}
>>  
>> -		/* FIXME support THP */
>> -		if (!page || !page->mapping || PageTransCompound(page)) {
>> +		if (!page || !page->mapping) {
>>  			mpfn = 0;
>>  			goto next;
>>  		}
>> @@ -394,14 +505,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
>>  	 */
>>  	int extra = 1 + (page == fault_page);
>>  
>> -	/*
>> -	 * FIXME support THP (transparent huge page), it is bit more complex to
>> -	 * check them than regular pages, because they can be mapped with a pmd
>> -	 * or with a pte (split pte mapping).
>> -	 */
>> -	if (folio_test_large(folio))
>> -		return false;
>> -
> 
> You cannot remove this check unless migrating normal mTHP folios to the device is
> supported, which I think this series doesn't do, but maybe it should?
> 

mTHP support needs to be a follow-up series; there are comments about this in the
cover letter. I had an assert earlier to prevent other folio sizes, but I've removed
it, and the interface to drivers does not allow for mTHP device-private folios.
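
To illustrate: the driver-facing interface only exposes compound migration at PMD
size. A rough sketch of the intended flow, where my_device_pfn stands in for
whatever compound device page the driver allocates and fill_dst_single_pages() is
a hypothetical driver helper:

	struct migrate_vma args = {
		.vma		= vma,
		.start		= addr,
		.end		= addr + HPAGE_PMD_SIZE,
		.src		= src_pfns,
		.dst		= dst_pfns,
		.pgmap_owner	= owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM |
				  MIGRATE_VMA_SELECT_COMPOUND,
	};

	if (migrate_vma_setup(&args))
		return -EFAULT;

	if (args.src[0] & MIGRATE_PFN_COMPOUND)
		/* the whole PMD was collected, back it with one compound page */
		args.dst[0] = migrate_pfn(my_device_pfn) | MIGRATE_PFN_COMPOUND;
	else
		/* collection fell back to PAGE_SIZE entries, fill dst[] per page */
		fill_dst_single_pages(&args);

	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);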

Balbir

