[PATCH v5 2/9] mm: Add an apply_to_pfn_range interface

Thomas Hellström (VMware) thellstrom at vmwopensource.org
Wed Jun 12 12:26:45 UTC 2019


On 6/12/19 2:16 PM, Christoph Hellwig wrote:
> On Wed, Jun 12, 2019 at 08:42:36AM +0200, Thomas Hellström (VMware) wrote:
>> From: Thomas Hellstrom <thellstrom at vmware.com>
>>
>> This is basically apply_to_page_range with added functionality:
>> Allocating missing parts of the page table becomes optional, which
>> means that the function can be guaranteed not to error if allocation
>> is disabled. Also passing of the closure struct and callback function
>> becomes different and more in line with how things are done elsewhere.
>>
>> Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range.
>>
>> The reason for not using the page-walk code is that we want to perform
>> the page-walk on vmas pointing to an address space without requiring the
>> mmap_sem to be held rather than on vmas belonging to a process with the
>> mmap_sem held.
>>
>> Notable changes since RFC:
>> Don't export apply_to_pfn_range.
>>
>> Cc: Andrew Morton <akpm at linux-foundation.org>
>> Cc: Matthew Wilcox <willy at infradead.org>
>> Cc: Will Deacon <will.deacon at arm.com>
>> Cc: Peter Zijlstra <peterz at infradead.org>
>> Cc: Rik van Riel <riel at surriel.com>
>> Cc: Minchan Kim <minchan at kernel.org>
>> Cc: Michal Hocko <mhocko at suse.com>
>> Cc: Huang Ying <ying.huang at intel.com>
>> Cc: Souptick Joarder <jrdr.linux at gmail.com>
>> Cc: "Jérôme Glisse" <jglisse at redhat.com>
>> Cc: linux-mm at kvack.org
>> Cc: linux-kernel at vger.kernel.org
>>
>> Signed-off-by: Thomas Hellstrom <thellstrom at vmware.com>
>> Reviewed-by: Ralph Campbell <rcampbell at nvidia.com> #v1
>> ---
>>   include/linux/mm.h |  10 ++++
>>   mm/memory.c        | 135 ++++++++++++++++++++++++++++++++++-----------
>>   2 files changed, 113 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 0e8834ac32b7..3d06ce2a64af 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>>   extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
>>   			       unsigned long size, pte_fn_t fn, void *data);
>>   
>> +struct pfn_range_apply;
>> +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>> +			 struct pfn_range_apply *closure);
>> +struct pfn_range_apply {
>> +	struct mm_struct *mm;
>> +	pter_fn_t ptefn;
>> +	unsigned int alloc;
>> +};
>> +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
>> +			      unsigned long address, unsigned long size);
>>   
>>   #ifdef CONFIG_PAGE_POISONING
>>   extern bool page_poisoning_enabled(void);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 168f546af1ad..462aa47f8878 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
>>   }
>>   EXPORT_SYMBOL(vm_iomap_memory);
>>   
>> -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
>> -				     unsigned long addr, unsigned long end,
>> -				     pte_fn_t fn, void *data)
>> +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
>> +			      unsigned long addr, unsigned long end)
>>   {
>>   	pte_t *pte;
>>   	int err;
>>   	pgtable_t token;
>>   	spinlock_t *uninitialized_var(ptl);
>>   
>> -	pte = (mm == &init_mm) ?
>> +	pte = (closure->mm == &init_mm) ?
>>   		pte_alloc_kernel(pmd, addr) :
>> -		pte_alloc_map_lock(mm, pmd, addr, &ptl);
>> +		pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
>>   	if (!pte)
>>   		return -ENOMEM;
>>   
>> @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
>>   	token = pmd_pgtable(*pmd);
>>   
>>   	do {
>> -		err = fn(pte++, token, addr, data);
>> +		err = closure->ptefn(pte++, token, addr, closure);
>>   		if (err)
>>   			break;
>>   	} while (addr += PAGE_SIZE, addr != end);
>>   
>>   	arch_leave_lazy_mmu_mode();
>>   
>> -	if (mm != &init_mm)
>> +	if (closure->mm != &init_mm)
>>   		pte_unmap_unlock(pte-1, ptl);
>>   	return err;
>>   }
>>   
>> -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
>> -				     unsigned long addr, unsigned long end,
>> -				     pte_fn_t fn, void *data)
>> +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
>> +			      unsigned long addr, unsigned long end)
>>   {
>>   	pmd_t *pmd;
>>   	unsigned long next;
>> -	int err;
>> +	int err = 0;
>>   
>>   	BUG_ON(pud_huge(*pud));
>>   
>> -	pmd = pmd_alloc(mm, pud, addr);
>> +	pmd = pmd_alloc(closure->mm, pud, addr);
>>   	if (!pmd)
>>   		return -ENOMEM;
>> +
>>   	do {
>>   		next = pmd_addr_end(addr, end);
>> -		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
>> +		if (!closure->alloc && pmd_none_or_clear_bad(pmd))
>> +			continue;
>> +		err = apply_to_pte_range(closure, pmd, addr, next);
>>   		if (err)
>>   			break;
>>   	} while (pmd++, addr = next, addr != end);
>>   	return err;
>>   }
>>   
>> -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
>> -				     unsigned long addr, unsigned long end,
>> -				     pte_fn_t fn, void *data)
>> +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
>> +			      unsigned long addr, unsigned long end)
>>   {
>>   	pud_t *pud;
>>   	unsigned long next;
>> -	int err;
>> +	int err = 0;
>>   
>> -	pud = pud_alloc(mm, p4d, addr);
>> +	pud = pud_alloc(closure->mm, p4d, addr);
>>   	if (!pud)
>>   		return -ENOMEM;
>> +
>>   	do {
>>   		next = pud_addr_end(addr, end);
>> -		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
>> +		if (!closure->alloc && pud_none_or_clear_bad(pud))
>> +			continue;
>> +		err = apply_to_pmd_range(closure, pud, addr, next);
>>   		if (err)
>>   			break;
>>   	} while (pud++, addr = next, addr != end);
>>   	return err;
>>   }
>>   
>> -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
>> -				     unsigned long addr, unsigned long end,
>> -				     pte_fn_t fn, void *data)
>> +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
>> +			      unsigned long addr, unsigned long end)
>>   {
>>   	p4d_t *p4d;
>>   	unsigned long next;
>> -	int err;
>> +	int err = 0;
>>   
>> -	p4d = p4d_alloc(mm, pgd, addr);
>> +	p4d = p4d_alloc(closure->mm, pgd, addr);
>>   	if (!p4d)
>>   		return -ENOMEM;
>> +
>>   	do {
>>   		next = p4d_addr_end(addr, end);
>> -		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
>> +		if (!closure->alloc && p4d_none_or_clear_bad(p4d))
>> +			continue;
>> +		err = apply_to_pud_range(closure, p4d, addr, next);
>>   		if (err)
>>   			break;
>>   	} while (p4d++, addr = next, addr != end);
>>   	return err;
>>   }
>>   
>> -/*
>> - * Scan a region of virtual memory, filling in page tables as necessary
>> - * and calling a provided function on each leaf page table.
>> +/**
>> + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
>> + * function on each leaf page table entry
>> + * @closure: Details about how to scan and what function to apply
>> + * @addr: Start virtual address
>> + * @size: Size of the region
>> + *
>> + * If @closure->alloc is set to 1, the function will fill in the page table
>> + * as necessary. Otherwise it will skip non-present parts.
>> + * Note: The caller must ensure that the range does not contain huge pages.
>> + * The caller must also ensure that the proper mmu_notifier functions are
>> + * called before and after the call to apply_to_pfn_range.
>> + *
>> + * WARNING: Do not use this function unless you know exactly what you are
>> + * doing. It is lacking support for huge pages and transparent huge pages.
>> + *
>> + * Return: Zero on success. If the provided function returns a non-zero status,
>> + * the page table walk will terminate and that status will be returned.
>> + * If @closure->alloc is set to 1, then this function may also return memory
>> + * allocation errors arising from allocating page table memory.
>>    */
>> -int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>> -			unsigned long size, pte_fn_t fn, void *data)
>> +int apply_to_pfn_range(struct pfn_range_apply *closure,
>> +		       unsigned long addr, unsigned long size)
>>   {
>>   	pgd_t *pgd;
>>   	unsigned long next;
>> @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>>   	if (WARN_ON(addr >= end))
>>   		return -EINVAL;
>>   
>> -	pgd = pgd_offset(mm, addr);
>> +	pgd = pgd_offset(closure->mm, addr);
>>   	do {
>>   		next = pgd_addr_end(addr, end);
>> -		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
>> +		if (!closure->alloc && pgd_none_or_clear_bad(pgd))
>> +			continue;
>> +		err = apply_to_p4d_range(closure, pgd, addr, next);
>>   		if (err)
>>   			break;
>>   	} while (pgd++, addr = next, addr != end);
>>   
>>   	return err;
>>   }
>> +
>> +/**
>> + * struct page_range_apply - Closure structure for apply_to_page_range()
>> + * @pter: The base closure structure we derive from
>> + * @fn: The leaf pte function to call
>> + * @data: The leaf pte function closure
>> + */
>> +struct page_range_apply {
>> +	struct pfn_range_apply pter;
>> +	pte_fn_t fn;
>> +	void *data;
>> +};
>> +
>> +/*
>> + * Callback wrapper to enable use of apply_to_pfn_range for
>> + * the apply_to_page_range interface
>> + */
>> +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
>> +				       unsigned long addr,
>> +				       struct pfn_range_apply *pter)
>> +{
>> +	struct page_range_apply *pra =
>> +		container_of(pter, typeof(*pra), pter);
>> +
>> +	return pra->fn(pte, token, addr, pra->data);
>> +}
>> +
>> +/*
>> + * Scan a region of virtual memory, filling in page tables as necessary
>> + * and calling a provided function on each leaf page table.
>> + *
>> + * WARNING: Do not use this function unless you know exactly what you are
>> + * doing. It is lacking support for huge pages and transparent huge pages.
>> + */
>> +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>> +			unsigned long size, pte_fn_t fn, void *data)
>> +{
>> +	struct page_range_apply pra = {
>> +		.pter = {.mm = mm,
>> +			 .alloc = 1,
>> +			 .ptefn = apply_to_page_range_wrapper },
>> +		.fn = fn,
>> +		.data = data
>> +	};
>> +
>> +	return apply_to_pfn_range(&pra.pter, addr, size);
>> +}
>>   
>>   EXPORT_SYMBOL_GPL(apply_to_page_range);
> Actually - did you look into converting our two handfuls of
> apply_to_page_range callers to your new scheme?  It seems like that
> might actually not be too bad and would avoid various layers of wrappers.

Yes, I had that in mind once this landed and got some serious testing.

/Thomas





More information about the dri-devel mailing list