[RFC v1 1/3] mm/mmu_notifier: Add a new notifier for mapping updates (new pages)

Alistair Popple apopple at nvidia.com
Thu Jul 20 09:00:52 UTC 2023


"Kasireddy, Vivek" <vivek.kasireddy at intel.com> writes:

> Hi Alistair,
>
>> 
>> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> > index 64a3239b6407..1f2f0209101a 100644
>> > --- a/mm/hugetlb.c
>> > +++ b/mm/hugetlb.c
>> > @@ -6096,8 +6096,12 @@ vm_fault_t hugetlb_fault(struct mm_struct
>> *mm, struct vm_area_struct *vma,
>> >  		 * hugetlb_no_page will drop vma lock and hugetlb fault
>> >  		 * mutex internally, which make us return immediately.
>> >  		 */
>> > -		return hugetlb_no_page(mm, vma, mapping, idx, address,
>> ptep,
>> > +		ret = hugetlb_no_page(mm, vma, mapping, idx, address,
>> ptep,
>> >  				      entry, flags);
>> > +		if (!ret)
>> > +			mmu_notifier_update_mapping(vma->vm_mm,
>> address,
>> > +						    pte_pfn(*ptep));
>> 
>> The next patch ends up calling pfn_to_page() on the result of
>> pte_pfn(*ptep). I don't think that's safe because couldn't the PTE have
>> already changed and/or the new page have been freed?
> Yeah, that might be possible; I believe the right thing to do would be:
> -               return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
> +               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
>                                       entry, flags);
> +               if (!ret) {
> +                       ptl = huge_pte_lock(h, mm, ptep);
> +                       mmu_notifier_update_mapping(vma->vm_mm, address,
> +                                                    pte_pfn(*ptep));
> +                       spin_unlock(ptl);
> +               }

Yes, although obviously as I think you point out below you wouldn't be
able to take any sleeping locks in mmu_notifier_update_mapping().

> In which case I'd need to make a similar change in the shmem path as well.
> And, also redo (or eliminate) the locking in udmabuf (patch) which seems a
> bit excessive on a second look given our use-case (where reads and writes do
> not happen simultaneously due to fence synchronization in the guest driver). 

I'm not at all familiar with the udmabuf use case, but that sounds
brittle and effectively makes this notifier udmabuf-specific, right?

The name gives the impression it is more general though. I have
contemplated adding a notifier for PTE updates for drivers using
hmm_range_fault(), as it would save some expensive device faults, and
this could be useful for that.

So if we're adding a notifier for PTE updates I think it would be good
if it covered all cases and was robust enough to allow mirroring of the
correct PTE value (ie. by being called under PTL or via some other
synchronisation like hmm_range_fault()).

Thanks.

> Thanks,
> Vivek
>
>> 
>> > +		return ret;
>> >
>> >  	ret = 0;
>> >
>> > @@ -6223,6 +6227,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm,
>> struct vm_area_struct *vma,
>> >  	 */
>> >  	if (need_wait_lock)
>> >  		folio_wait_locked(folio);
>> > +	if (!ret)
>> > +		mmu_notifier_update_mapping(vma->vm_mm, address,
>> > +					    pte_pfn(*ptep));
>> >  	return ret;
>> >  }
>> >
>> > diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
>> > index 50c0dde1354f..6421405334b9 100644
>> > --- a/mm/mmu_notifier.c
>> > +++ b/mm/mmu_notifier.c
>> > @@ -441,6 +441,23 @@ void __mmu_notifier_change_pte(struct
>> mm_struct *mm, unsigned long address,
>> >  	srcu_read_unlock(&srcu, id);
>> >  }
>> >
>> > +void __mmu_notifier_update_mapping(struct mm_struct *mm, unsigned
>> long address,
>> > +				   unsigned long pfn)
>> > +{
>> > +	struct mmu_notifier *subscription;
>> > +	int id;
>> > +
>> > +	id = srcu_read_lock(&srcu);
>> > +	hlist_for_each_entry_rcu(subscription,
>> > +				 &mm->notifier_subscriptions->list, hlist,
>> > +				 srcu_read_lock_held(&srcu)) {
>> > +		if (subscription->ops->update_mapping)
>> > +			subscription->ops->update_mapping(subscription,
>> mm,
>> > +							  address, pfn);
>> > +	}
>> > +	srcu_read_unlock(&srcu, id);
>> > +}
>> > +
>> >  static int mn_itree_invalidate(struct mmu_notifier_subscriptions
>> *subscriptions,
>> >  			       const struct mmu_notifier_range *range)
>> >  {
>> > diff --git a/mm/shmem.c b/mm/shmem.c
>> > index 2f2e0e618072..e59eb5fafadb 100644
>> > --- a/mm/shmem.c
>> > +++ b/mm/shmem.c
>> > @@ -77,6 +77,7 @@ static struct vfsmount *shm_mnt;
>> >  #include <linux/fcntl.h>
>> >  #include <uapi/linux/memfd.h>
>> >  #include <linux/rmap.h>
>> > +#include <linux/mmu_notifier.h>
>> >  #include <linux/uuid.h>
>> >
>> >  #include <linux/uaccess.h>
>> > @@ -2164,8 +2165,12 @@ static vm_fault_t shmem_fault(struct vm_fault
>> *vmf)
>> >  				  gfp, vma, vmf, &ret);
>> >  	if (err)
>> >  		return vmf_error(err);
>> > -	if (folio)
>> > +	if (folio) {
>> >  		vmf->page = folio_file_page(folio, vmf->pgoff);
>> > +		if (ret == VM_FAULT_LOCKED)
>> > +			mmu_notifier_update_mapping(vma->vm_mm, vmf-
>> >address,
>> > +						    page_to_pfn(vmf->page));
>> > +	}
>> >  	return ret;
>> >  }



More information about the dri-devel mailing list