[PATCH hmm 02/15] mm/mmu_notifier: add an interval tree notifier

Jerome Glisse jglisse at redhat.com
Mon Oct 21 18:30:56 UTC 2019


On Tue, Oct 15, 2019 at 03:12:29PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg at mellanox.com>
> 
> Of the 13 users of mmu_notifiers, 8 of them use only
> invalidate_range_start/end() and immediately intersect the
> mmu_notifier_range with some kind of internal list of VAs.  4 use an
> interval tree (i915_gem, radeon_mn, umem_odp, hfi1). 4 use a linked list
> of some kind (scif_dma, vhost, gntdev, hmm)
> 
> And the remaining 5 either don't use invalidate_range_start() or do some
> special thing with it.
> 
> It turns out that building a correct scheme with an interval tree is
> pretty complicated, particularly if the use case is synchronizing against
> another thread doing get_user_pages().  Many of these implementations have
> various subtle and difficult to fix races.
> 
> This approach puts the interval tree as common code at the top of the mmu
> notifier call tree and implements a shareable locking scheme.
> 
> It includes:
>  - An interval tree tracking VA ranges, with per-range callbacks
>  - A read/write locking scheme for the interval tree that avoids
>    sleeping in the notifier path (for OOM killer)
>  - A sequence counter based collision-retry locking scheme to tell
>    device page fault that a VA range is being concurrently invalidated.
> 
> This is based on various ideas:
> - hmm accumulates invalidated VA ranges and releases them when all
>   invalidates are done, via active_invalidate_ranges count.
>   This approach avoids having to intersect the interval tree twice (as
>   umem_odp does) at the potential cost of a longer device page fault.
> 
> - kvm/umem_odp use a sequence counter to drive the collision retry,
>   via invalidate_seq
> 
> - a deferred work todo list on unlock scheme like RTNL, via deferred_list.
>   This makes adding/removing interval tree members more deterministic
> 
> - seqlock, except this version makes the seqlock idea multi-holder on the
>   write side by protecting it with active_invalidate_ranges and a spinlock
> 
> To minimize MM overhead when only the interval tree is being used, the
> entire SRCU and hlist overheads are dropped using some simple
> branches. Similarly the interval tree overhead is dropped when in hlist
> mode.
> 
> The overhead from the mandatory spinlock is broadly the same as most of
> existing users which already had a lock (or two) of some sort on the
> invalidation path.
> 
> Cc: Andrea Arcangeli <aarcange at redhat.com>
> Cc: Michal Hocko <mhocko at kernel.org>
> Signed-off-by: Jason Gunthorpe <jgg at mellanox.com>
> ---
>  include/linux/mmu_notifier.h |  78 ++++++
>  mm/Kconfig                   |   1 +
>  mm/mmu_notifier.c            | 529 +++++++++++++++++++++++++++++++++--
>  3 files changed, 583 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 12bd603d318ce7..bc2b12483de127 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -6,10 +6,12 @@
>  #include <linux/spinlock.h>
>  #include <linux/mm_types.h>
>  #include <linux/srcu.h>
> +#include <linux/interval_tree.h>
>  
>  struct mmu_notifier_mm;
>  struct mmu_notifier;
>  struct mmu_notifier_range;
> +struct mmu_range_notifier;
>  
>  /**
>   * enum mmu_notifier_event - reason for the mmu notifier callback
> @@ -32,6 +34,9 @@ struct mmu_notifier_range;
>   * access flags). User should soft dirty the page in the end callback to make
>   * sure that anyone relying on soft dirtyness catch pages that might be written
>   * through non CPU mappings.
> + *
> + * @MMU_NOTIFY_RELEASE: used during mmu_range_notifier invalidate to signal that
> + * the mm refcount is zero and the range is no longer accessible.
>   */
>  enum mmu_notifier_event {
>  	MMU_NOTIFY_UNMAP = 0,
> @@ -39,6 +44,7 @@ enum mmu_notifier_event {
>  	MMU_NOTIFY_PROTECTION_VMA,
>  	MMU_NOTIFY_PROTECTION_PAGE,
>  	MMU_NOTIFY_SOFT_DIRTY,
> +	MMU_NOTIFY_RELEASE,
>  };
>  
>  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
> @@ -222,6 +228,25 @@ struct mmu_notifier {
>  	unsigned int users;
>  };
>  
> +/**
> + * struct mmu_range_notifier_ops
> + * @invalidate: Upon return the caller must stop using any SPTEs within this
> + *              range, this function can sleep. Return false if blocking was
> + *              required but range is non-blocking
> + */
> +struct mmu_range_notifier_ops {
> +	bool (*invalidate)(struct mmu_range_notifier *mrn,
> +			   const struct mmu_notifier_range *range);
> +};
> +
> +struct mmu_range_notifier {
> +	struct interval_tree_node interval_tree;
> +	const struct mmu_range_notifier_ops *ops;
> +	struct hlist_node deferred_item;
> +	unsigned long invalidate_seq;
> +	struct mm_struct *mm;
> +};
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  #ifdef CONFIG_LOCKDEP
> @@ -263,6 +288,59 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
>  				   struct mm_struct *mm);
>  extern void mmu_notifier_unregister(struct mmu_notifier *mn,
>  				    struct mm_struct *mm);
> +
> +unsigned long mmu_range_read_begin(struct mmu_range_notifier *mrn);
> +int mmu_range_notifier_insert(struct mmu_range_notifier *mrn,
> +			      unsigned long start, unsigned long length,
> +			      struct mm_struct *mm);
> +int mmu_range_notifier_insert_locked(struct mmu_range_notifier *mrn,
> +				     unsigned long start, unsigned long length,
> +				     struct mm_struct *mm);
> +void mmu_range_notifier_remove(struct mmu_range_notifier *mrn);
> +
> +/**
> + * mmu_range_read_retry - End a read side critical section against a VA range
> + * mrn: The range under lock
> + * seq: The return of the paired mmu_range_read_begin()
> + *
> + * This MUST be called under a user provided lock that is also held
> + * unconditionally by op->invalidate(). That lock provides the required SMP
> + * barrier for handling invalidate_seq.
> + *
> + * Each call should be paired with a single mmu_range_read_begin() and
> + * should be used to conclude the read side.
> + *
> + * Returns true if an invalidation collided with this critical section, and
> + * the caller should retry.
> + */
> +static inline bool mmu_range_read_retry(struct mmu_range_notifier *mrn,
> +					unsigned long seq)
> +{
> +	return READ_ONCE(mrn->invalidate_seq) != seq;
> +}

What about calling this mmu_range_read_end() instead? That would match
mmu_range_read_begin().
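
As an aside for readers following the thread, here is a minimal usage
sketch of the begin/retry pairing described in the kerneldoc above. The
driver names (struct drv_range, drv_build_sptes(), drv->lock, ...) are
hypothetical; drv->lock stands in for the required user provided lock
that ops->invalidate() must also take:

    static int drv_populate_range(struct drv_range *drv)
    {
        unsigned long seq;

    again:
        seq = mmu_range_read_begin(&drv->mrn);

        /* Sleepable work, e.g. get_user_pages() to build the SPTEs */
        if (drv_build_sptes(drv))
            return -EFAULT;

        mutex_lock(&drv->lock);
        if (mmu_range_read_retry(&drv->mrn, seq)) {
            /* An invalidation collided; discard the work and retry */
            mutex_unlock(&drv->lock);
            drv_destroy_sptes(drv);
            goto again;
        }
        /* Commit the SPTEs while still holding the user lock */
        drv_commit_sptes(drv);
        mutex_unlock(&drv->lock);
        return 0;
    }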


> +
> +/**
> + * mmu_range_check_retry - Test if a collision has occurred
> + * mrn: The range under lock
> + * seq: The return of the matching mmu_range_read_begin()
> + *
> + * This can be used in the critical section between mmu_range_read_begin() and
> + * mmu_range_read_retry().  A return of true indicates an invalidation has
> + * collided with this lock and a future mmu_range_read_retry() will return
> + * true.
> + *
> + * False is not reliable and only suggests a collision has not happened. It
> + * can be called many times and does not have to hold the user provided lock.
> + *
> + * This call can be used as part of loops and other expensive operations to
> + * expedite a retry.
> + */
> +static inline bool mmu_range_check_retry(struct mmu_range_notifier *mrn,
> +					 unsigned long seq)
> +{
> +	return READ_ONCE(mrn->invalidate_seq) != seq;
> +}
> +
>  extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
>  extern void __mmu_notifier_release(struct mm_struct *mm);
>  extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
> diff --git a/mm/Kconfig b/mm/Kconfig
> index a5dae9a7eb510a..d0b5046d9aeffd 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -284,6 +284,7 @@ config VIRT_TO_BUS
>  config MMU_NOTIFIER
>  	bool
>  	select SRCU
> +	select INTERVAL_TREE
>  
>  config KSM
>  	bool "Enable KSM for page merging"
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index 367670cfd02b7b..5e5e75ebcde4af 100644
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -12,6 +12,7 @@
>  #include <linux/export.h>
>  #include <linux/mm.h>
>  #include <linux/err.h>
> +#include <linux/interval_tree.h>
>  #include <linux/srcu.h>
>  #include <linux/rcupdate.h>
>  #include <linux/sched.h>
> @@ -36,10 +37,243 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
>  struct mmu_notifier_mm {
>  	/* all mmu notifiers registered in this mm are queued in this list */
>  	struct hlist_head list;
> +	bool has_interval;
>  	/* to serialize the list modifications and hlist_unhashed */
>  	spinlock_t lock;
> +	unsigned long invalidate_seq;
> +	unsigned long active_invalidate_ranges;
> +	struct rb_root_cached itree;
> +	wait_queue_head_t wq;
> +	struct hlist_head deferred_list;
>  };
>  
> +/*
> + * This is a collision-retry read-side/write-side 'lock', a lot like a
> + * seqcount, however this allows multiple write-sides to hold it at
> + * once. Conceptually the write side is protecting the values of the PTEs in
> + * this mm, such that PTES cannot be read into SPTEs while any writer exists.
> + *
> + * Note that the core mm creates nested invalidate_range_start()/end() regions
> + * within the same thread, and runs invalidate_range_start()/end() in parallel
> + * on multiple CPUs. This is designed to not reduce concurrency or block
> + * progress on the mm side.
> + *
> + * As a secondary function, holding the full write side also serves to prevent
> + * writers for the itree, this is an optimization to avoid extra locking
> + * during invalidate_range_start/end notifiers.
> + *
> + * The write side has two states, fully excluded:
> + *  - mm->active_invalidate_ranges != 0
> + *  - mnn->invalidate_seq & 1 == True
> + *  - some range on the mm_struct is being invalidated
> + *  - the itree is not allowed to change
> + *
> + * And partially excluded:
> + *  - mm->active_invalidate_ranges != 0
> + *  - some range on the mm_struct is being invalidated
> + *  - the itree is allowed to change
> + *
> + * The later state avoids some expensive work on inv_end in the common case of
> + * no mrn monitoring the VA.
> + */
> +static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm)
> +{
> +	lockdep_assert_held(&mmn_mm->lock);
> +	return mmn_mm->invalidate_seq & 1;
> +}
> +
> +static struct mmu_range_notifier *
> +mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm,
> +			 const struct mmu_notifier_range *range,
> +			 unsigned long *seq)
> +{
> +	struct interval_tree_node *node;
> +	struct mmu_range_notifier *res = NULL;
> +
> +	spin_lock(&mmn_mm->lock);
> +	mmn_mm->active_invalidate_ranges++;
> +	node = interval_tree_iter_first(&mmn_mm->itree, range->start,
> +					range->end - 1);
> +	if (node) {
> +		mmn_mm->invalidate_seq |= 1;
> +		res = container_of(node, struct mmu_range_notifier,
> +				   interval_tree);
> +	}
> +
> +	*seq = mmn_mm->invalidate_seq;
> +	spin_unlock(&mmn_mm->lock);
> +	return res;
> +}
> +
> +static struct mmu_range_notifier *
> +mn_itree_inv_next(struct mmu_range_notifier *mrn,
> +		  const struct mmu_notifier_range *range)
> +{
> +	struct interval_tree_node *node;
> +
> +	node = interval_tree_iter_next(&mrn->interval_tree, range->start,
> +				       range->end - 1);
> +	if (!node)
> +		return NULL;
> +	return container_of(node, struct mmu_range_notifier, interval_tree);
> +}
> +
> +static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
> +{
> +	struct mmu_range_notifier *mrn;
> +	struct hlist_node *next;
> +	bool need_wake = false;
> +
> +	spin_lock(&mmn_mm->lock);
> +	if (--mmn_mm->active_invalidate_ranges ||
> +	    !mn_itree_is_invalidating(mmn_mm)) {
> +		spin_unlock(&mmn_mm->lock);
> +		return;
> +	}
> +
> +	mmn_mm->invalidate_seq++;
> +	need_wake = true;
> +
> +	/*
> +	 * The inv_end incorporates a deferred mechanism like rtnl. Adds and

The rtnl reference will be lost on people unfamiliar with the
networking code :) Maybe spell it out as rtnl_lock()/rtnl_unlock() so
people have a chance to grep the right function. Assuming I am myself
getting the right reference :)

> +	 * removes are queued until the final inv_end happens then they are
> +	 * progressed. This arrangement for tree updates is used to avoid
> +	 * using a blocking lock during invalidate_range_start.
> +	 */
> +	hlist_for_each_entry_safe(mrn, next, &mmn_mm->deferred_list,
> +				  deferred_item) {
> +		if (RB_EMPTY_NODE(&mrn->interval_tree.rb))
> +			interval_tree_insert(&mrn->interval_tree,
> +					     &mmn_mm->itree);
> +		else
> +			interval_tree_remove(&mrn->interval_tree,
> +					     &mmn_mm->itree);
> +		hlist_del(&mrn->deferred_item);
> +	}
> +	spin_unlock(&mmn_mm->lock);
> +
> +	/*
> +	 * TODO: Since we already have a spinlock above, this would be faster
> +	 * as wake_up_q
> +	 */
> +	if (need_wake)
> +		wake_up_all(&mmn_mm->wq);
> +}
> +
> +/**
> + * mmu_range_read_begin - Begin a read side critical section against a VA range
> + * mrn: The range to lock
> + *
> + * mmu_range_read_begin()/mmu_range_read_retry() implement a collision-retry
> + * locking scheme similar to seqcount for the VA range under mrn. If the mm
> + * invokes invalidation during the critical section then
> + * mmu_range_read_retry() will return true.
> + *
> + * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
> + * require a blocking context.  The critical region formed by this lock can
> + * sleep, and the required 'user_lock' can also be a sleeping lock.
> + *
> + * The caller is required to provide a 'user_lock' to serialize both teardown
> + * and setup.
> + *
> + * The return value should be passed to mmu_range_read_retry().
> + */
> +unsigned long mmu_range_read_begin(struct mmu_range_notifier *mrn)
> +{
> +	struct mmu_notifier_mm *mmn_mm = mrn->mm->mmu_notifier_mm;
> +	unsigned long seq;
> +	bool is_invalidating;
> +
> +	/*
> +	 * If the mrn has a different seq value under the user_lock than we
> +	 * started with then it has collided.
> +	 *
> +	 * If the mrn currently has the same seq value as the mmn_mm seq, then
> +	 * it is currently between invalidate_start/end and is colliding.
> +	 *
> +	 * The locking looks broadly like this:
> +	 *   mn_tree_invalidate_start():          mmu_range_read_begin():
> +	 *                                         spin_lock
> +	 *                                          seq = READ_ONCE(mrn->invalidate_seq);
> +	 *                                          seq == mmn_mm->invalidate_seq
> +	 *                                         spin_unlock
> +	 *    spin_lock
> +	 *     seq = ++mmn_mm->invalidate_seq
> +	 *    spin_unlock
> +	 *    mrn->invalidate_seq = seq
> +	 *     op->invalidate_range():
> +	 *       user_lock
> +	 *       user_unlock
> +	 *
> +	 *                          [Required: mmu_range_read_retry() == true]
> +	 *
> +	 *   mn_itree_inv_end():
> +	 *    spin_lock
> +	 *     seq = ++mmn_mm->invalidate_seq
> +	 *    spin_unlock
> +	 *
> +	 *                                        user_lock
> +	 *                                         mmu_range_read_retry():
> +	 *                                          READ_ONCE(mrn->invalidate_seq) != seq
> +	 *                                        user_unlock
> +	 *
> +	 * Logically mrn->invalidate_seq is locked under the user provided
> +	 * lock, however the write is placed before that lock due to the way
> +	 * the API is layered.
> +	 *
> +	 * Barriers are not needed as any races here are closed by an eventual
> +	 * mmu_range_read_retry(), which provides a barrier via the user_lock.
> +	 */
> +	spin_lock(&mmn_mm->lock);
> +	seq = READ_ONCE(mrn->invalidate_seq);
> +	is_invalidating = seq == mmn_mm->invalidate_seq;
> +	spin_unlock(&mmn_mm->lock);
> +
> +	/*
> +	 * mrn->invalidate_seq is always set to an odd value. This ensures
> +	 * that if seq does wrap we will always clear the below sleep in some
> +	 * reasonable time as mmn_mm->invalidate_seq is even in the idle
> +	 * state.

I think this comment should live with the struct mmu_range_notifier
definition, with just a pointer to it from here, since the same comment
would be useful down below.
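
For reference, a condensed restatement of the invariant in question
(only a paraphrase of the patch's own comments, not new behaviour):

    /*
     * mmn_mm->invalidate_seq - even while idle, odd while at least one
     *                          interval tree range is being invalidated
     * mrn->invalidate_seq    - always written with an odd value
     *
     * So a reader sleeping in mmu_range_read_begin() on a stale (odd)
     * seq is guaranteed to see mmn_mm->invalidate_seq become different
     * from it in reasonable time, even if the counter wraps.
     */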

> +	 */
> +	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
> +	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
> +	if (is_invalidating)
> +		wait_event(mmn_mm->wq,
> +			   READ_ONCE(mmn_mm->invalidate_seq) != seq);
> +
> +	/*
> +	 * Notice that mmu_range_read_retry() can already be true at this
> +	 * point, avoiding loops here allows the user of this lock to provide
> +	 * a global time bound.
> +	 */
> +
> +	return seq;
> +}
> +EXPORT_SYMBOL_GPL(mmu_range_read_begin);
> +
> +static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
> +			     struct mm_struct *mm)
> +{
> +	struct mmu_notifier_range range = {
> +		.flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
> +		.event = MMU_NOTIFY_RELEASE,
> +		.mm = mm,
> +		.start = 0,
> +		.end = ULONG_MAX,
> +	};
> +	struct mmu_range_notifier *mrn;
> +	unsigned long cur_seq;
> +	bool ret;
> +
> +	for (mrn = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mrn;
> +	     mrn = mn_itree_inv_next(mrn, &range)) {
> +		ret = mrn->ops->invalidate(mrn, &range);
> +		WARN_ON(ret);
> +	}
> +
> +	mn_itree_inv_end(mmn_mm);
> +}
> +
>  /*
>   * This function can't run concurrently against mmu_notifier_register
>   * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
> @@ -52,17 +286,24 @@ struct mmu_notifier_mm {
>   * can't go away from under us as exit_mmap holds an mm_count pin
>   * itself.
>   */
> -void __mmu_notifier_release(struct mm_struct *mm)
> +static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm,
> +			     struct mm_struct *mm)
>  {
>  	struct mmu_notifier *mn;
>  	int id;
>  
> +	if (mmn_mm->has_interval)
> +		mn_itree_release(mmn_mm, mm);
> +
> +	if (hlist_empty(&mmn_mm->list))
> +		return;
> +
>  	/*
>  	 * SRCU here will block mmu_notifier_unregister until
>  	 * ->release returns.
>  	 */
>  	id = srcu_read_lock(&srcu);
> -	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
> +	hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist)
>  		/*
>  		 * If ->release runs before mmu_notifier_unregister it must be
>  		 * handled, as it's the only way for the driver to flush all
> @@ -72,9 +313,9 @@ void __mmu_notifier_release(struct mm_struct *mm)
>  		if (mn->ops->release)
>  			mn->ops->release(mn, mm);
>  
> -	spin_lock(&mm->mmu_notifier_mm->lock);
> -	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
> -		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
> +	spin_lock(&mmn_mm->lock);
> +	while (unlikely(!hlist_empty(&mmn_mm->list))) {
> +		mn = hlist_entry(mmn_mm->list.first,
>  				 struct mmu_notifier,
>  				 hlist);
>  		/*
> @@ -85,7 +326,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
>  		 */
>  		hlist_del_init_rcu(&mn->hlist);
>  	}
> -	spin_unlock(&mm->mmu_notifier_mm->lock);
> +	spin_unlock(&mmn_mm->lock);
>  	srcu_read_unlock(&srcu, id);
>  
>  	/*
> @@ -100,6 +341,17 @@ void __mmu_notifier_release(struct mm_struct *mm)
>  	synchronize_srcu(&srcu);
>  }
>  
> +void __mmu_notifier_release(struct mm_struct *mm)
> +{
> +	struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
> +
> +	if (mmn_mm->has_interval)
> +		mn_itree_release(mmn_mm, mm);
> +
> +	if (!hlist_empty(&mmn_mm->list))
> +		mn_hlist_release(mmn_mm, mm);
> +}
> +
>  /*
>   * If no young bitflag is supported by the hardware, ->clear_flush_young can
>   * unmap the address and return 1 or 0 depending if the mapping previously
> @@ -172,14 +424,41 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
>  	srcu_read_unlock(&srcu, id);
>  }
>  
> -int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
> +static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm,
> +				     const struct mmu_notifier_range *range)
> +{
> +	struct mmu_range_notifier *mrn;
> +	unsigned long cur_seq;
> +
> +	for (mrn = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mrn;
> +	     mrn = mn_itree_inv_next(mrn, range)) {
> +		bool ret;
> +
> +		WRITE_ONCE(mrn->invalidate_seq, cur_seq);
> +		ret = mrn->ops->invalidate(mrn, range);
> +		if (!ret && !WARN_ON(mmu_notifier_range_blockable(range)))

Isn't the logic wrong here? We want to warn if the range
was marked as blockable and invalidate returned false. Also
we want to back off no matter what if invalidate returns
false, i.e.:
    if (!ret) {
        WARN_ON(mmu_notifier_range_blockable(range));
        goto out_would_block;
    }


> +			goto out_would_block;
> +	}
> +	return 0;
> +
> +out_would_block:
> +	/*
> +	 * On -EAGAIN the non-blocking caller is not allowed to call
> +	 * invalidate_range_end()
> +	 */
> +	mn_itree_inv_end(mmn_mm);
> +	return -EAGAIN;
> +}
> +
> +static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm,
> +					   struct mmu_notifier_range *range)
>  {
>  	struct mmu_notifier *mn;
>  	int ret = 0;
>  	int id;
>  
>  	id = srcu_read_lock(&srcu);
> -	hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
> +	hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
>  		if (mn->ops->invalidate_range_start) {
>  			int _ret;
>  
> @@ -203,15 +482,30 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
>  	return ret;
>  }
>  
> -void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
> -					 bool only_end)
> +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
> +{
> +	struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
> +	int ret = 0;
> +
> +	if (mmn_mm->has_interval) {
> +		ret = mn_itree_invalidate(mmn_mm, range);
> +		if (ret)
> +			return ret;
> +	}
> +	if (!hlist_empty(&mmn_mm->list))
> +		return mn_hlist_invalidate_range_start(mmn_mm, range);
> +	return 0;
> +}
> +
> +static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm,
> +				    struct mmu_notifier_range *range,
> +				    bool only_end)
>  {
>  	struct mmu_notifier *mn;
>  	int id;
>  
> -	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
>  	id = srcu_read_lock(&srcu);
> -	hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
> +	hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
>  		/*
>  		 * Call invalidate_range here too to avoid the need for the
>  		 * subsystem of having to register an invalidate_range_end
> @@ -238,6 +532,19 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
>  		}
>  	}
>  	srcu_read_unlock(&srcu, id);
> +}
> +
> +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
> +					 bool only_end)
> +{
> +	struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
> +
> +	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
> +	if (mmn_mm->has_interval)
> +		mn_itree_inv_end(mmn_mm);
> +
> +	if (!hlist_empty(&mmn_mm->list))
> +		mn_hlist_invalidate_end(mmn_mm, range, only_end);
>  	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
>  }
>  
> @@ -256,8 +563,9 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  
>  /*
> - * Same as mmu_notifier_register but here the caller must hold the
> - * mmap_sem in write mode.
> + * Same as mmu_notifier_register but here the caller must hold the mmap_sem in
> + * write mode. A NULL mn signals the notifier is being registered for itree
> + * mode.
>   */
>  int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
>  {
> @@ -274,9 +582,6 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
>  		fs_reclaim_release(GFP_KERNEL);
>  	}
>  
> -	mn->mm = mm;
> -	mn->users = 1;
> -
>  	if (!mm->mmu_notifier_mm) {
>  		/*
>  		 * kmalloc cannot be called under mm_take_all_locks(), but we
> @@ -284,21 +589,22 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
>  		 * the write side of the mmap_sem.
>  		 */
>  		mmu_notifier_mm =
> -			kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
> +			kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
>  		if (!mmu_notifier_mm)
>  			return -ENOMEM;
>  
>  		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
>  		spin_lock_init(&mmu_notifier_mm->lock);
> +		mmu_notifier_mm->invalidate_seq = 2;

Why start at 2?

> +		mmu_notifier_mm->itree = RB_ROOT_CACHED;
> +		init_waitqueue_head(&mmu_notifier_mm->wq);
> +		INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list);
>  	}
>  
>  	ret = mm_take_all_locks(mm);
>  	if (unlikely(ret))
>  		goto out_clean;
>  
> -	/* Pairs with the mmdrop in mmu_notifier_unregister_* */
> -	mmgrab(mm);
> -
>  	/*
>  	 * Serialize the update against mmu_notifier_unregister. A
>  	 * side note: mmu_notifier_release can't run concurrently with
> @@ -306,13 +612,26 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
>  	 * current->mm or explicitly with get_task_mm() or similar).
>  	 * We can't race against any other mmu notifier method either
>  	 * thanks to mm_take_all_locks().
> +	 *
> +	 * release semantics are provided for users not inside a lock covered
> +	 * by mm_take_all_locks(). acquire can only be used while holding the
> +	 * mmgrab or mmget, and is safe because once created the
> +	 * mmu_notififer_mm is not freed until the mm is destroyed.
>  	 */
>  	if (mmu_notifier_mm)
> -		mm->mmu_notifier_mm = mmu_notifier_mm;
> +		smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm);

I do not understand why you need the release semantics here. We are
under the mmap_sem in write mode; when we release it, the lock barrier
will make sure anyone else sees the new mmu_notifier_mm.

I fail to see the benefit of, or need for, release/acquire semantics
here.
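
Just so the pairing under discussion is visible in one place, a
condensed excerpt of the two sides as they appear in this patch:

    /* __mmu_notifier_register(), with mmap_sem held for write */
    smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm);

    /* mmu_range_notifier_insert(), read before mmap_sem is taken */
    mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm);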

>  
> -	spin_lock(&mm->mmu_notifier_mm->lock);
> -	hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
> -	spin_unlock(&mm->mmu_notifier_mm->lock);
> +	if (mn) {
> +		/* Pairs with the mmdrop in mmu_notifier_unregister_* */
> +		mmgrab(mm);
> +		mn->mm = mm;
> +		mn->users = 1;
> +
> +		spin_lock(&mm->mmu_notifier_mm->lock);
> +		hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
> +		spin_unlock(&mm->mmu_notifier_mm->lock);
> +	} else
> +		mm->mmu_notifier_mm->has_interval = true;
>  
>  	mm_drop_all_locks(mm);
>  	BUG_ON(atomic_read(&mm->mm_users) <= 0);
> @@ -529,6 +848,166 @@ void mmu_notifier_put(struct mmu_notifier *mn)
>  }
>  EXPORT_SYMBOL_GPL(mmu_notifier_put);
>  
> +static int __mmu_range_notifier_insert(struct mmu_range_notifier *mrn,
> +				       unsigned long start,
> +				       unsigned long length,
> +				       struct mmu_notifier_mm *mmn_mm,
> +				       struct mm_struct *mm)
> +{
> +	mrn->mm = mm;
> +	RB_CLEAR_NODE(&mrn->interval_tree.rb);
> +	mrn->interval_tree.start = start;
> +	/*
> +	 * Note that the representation of the intervals in the interval tree
> +	 * considers the ending point as contained in the interval.
> +	 */
> +	if (length == 0 ||
> +	    check_add_overflow(start, length - 1, &mrn->interval_tree.last))
> +		return -EOVERFLOW;
> +
> +	/* pairs with mmdrop in mmu_range_notifier_remove() */
> +	mmgrab(mm);
> +
> +	/*
> +	 * If some invalidate_range_start/end region is going on in parallel
> +	 * we don't know what VA ranges are affected, so we must assume this
> +	 * new range is included.
> +	 *
> +	 * If the itree is invalidating then we are not allowed to change
> +	 * it. Retrying until invalidation is done is tricky due to the
> +	 * possibility for live lock, instead defer the add to the unlock so
> +	 * this algorithm is deterministic.
> +	 *
> +	 * In all cases the value for the mrn->mr_invalidate_seq should be
> +	 * odd, see mmu_range_read_begin()
> +	 */
> +	spin_lock(&mmn_mm->lock);
> +	if (mmn_mm->active_invalidate_ranges) {
> +		if (mn_itree_is_invalidating(mmn_mm))
> +			hlist_add_head(&mrn->deferred_item,
> +				       &mmn_mm->deferred_list);
> +		else {
> +			mmn_mm->invalidate_seq |= 1;
> +			interval_tree_insert(&mrn->interval_tree,
> +					     &mmn_mm->itree);
> +		}
> +		mrn->invalidate_seq = mmn_mm->invalidate_seq;
> +	} else {
> +		WARN_ON(mn_itree_is_invalidating(mmn_mm));
> +		mrn->invalidate_seq = mmn_mm->invalidate_seq - 1;
> +		interval_tree_insert(&mrn->interval_tree, &mmn_mm->itree);
> +	}
> +	spin_unlock(&mmn_mm->lock);
> +	return 0;
> +}
> +
> +/**
> + * mmu_range_notifier_insert - Insert a range notifier
> + * @mrn: Range notifier to register
> + * @start: Starting virtual address to monitor
> + * @length: Length of the range to monitor
> + * @mm : mm_struct to attach to
> + *
> + * This function subscribes the range notifier for notifications from the mm.
> + * Upon return the ops related to mmu_range_notifier will be called whenever
> + * an event that intersects with the given range occurs.
> + *
> + * Upon return the range_notifier may not be present in the interval tree yet.
> + * The caller must use the normal range notifier locking flow via
> + * mmu_range_read_begin() to establish SPTEs for this range.
> + */
> +int mmu_range_notifier_insert(struct mmu_range_notifier *mrn,
> +			      unsigned long start, unsigned long length,
> +			      struct mm_struct *mm)
> +{
> +	struct mmu_notifier_mm *mmn_mm;
> +	int ret;
> +
> +	might_lock(&mm->mmap_sem);
> +
> +	mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm);
> +	if (!mmn_mm || !mmn_mm->has_interval) {
> +		ret = mmu_notifier_register(NULL, mm);
> +		if (ret)
> +			return ret;
> +		mmn_mm = mm->mmu_notifier_mm;
> +	}
> +	return __mmu_range_notifier_insert(mrn, start, length, mmn_mm, mm);
> +}
> +EXPORT_SYMBOL_GPL(mmu_range_notifier_insert);
> +
> +int mmu_range_notifier_insert_locked(struct mmu_range_notifier *mrn,
> +				     unsigned long start, unsigned long length,
> +				     struct mm_struct *mm)
> +{
> +	struct mmu_notifier_mm *mmn_mm;
> +	int ret;
> +
> +	lockdep_assert_held_write(&mm->mmap_sem);
> +
> +	mmn_mm = mm->mmu_notifier_mm;

Shouldn't you be using smp_load_acquire()?

> +	if (!mmn_mm || !mmn_mm->has_interval) {
> +		ret = __mmu_notifier_register(NULL, mm);
> +		if (ret)
> +			return ret;
> +		mmn_mm = mm->mmu_notifier_mm;
> +	}
> +	return __mmu_range_notifier_insert(mrn, start, length, mmn_mm, mm);
> +}
> +EXPORT_SYMBOL_GPL(mmu_range_notifier_insert_locked);
> +
> +/**
> + * mmu_range_notifier_remove - Remove a range notifier
> + * @mrn: Range notifier to unregister
> + *
> + * This function must be paired with mmu_range_notifier_insert(). It cannot be
> + * called from any ops callback.
> + *
> + * Once this returns ops callbacks are no longer running on other CPUs and
> + * will not be called in future.
> + */
> +void mmu_range_notifier_remove(struct mmu_range_notifier *mrn)
> +{
> +	struct mm_struct *mm = mrn->mm;
> +	struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
> +	unsigned long seq = 0;
> +
> +	might_sleep();
> +
> +	spin_lock(&mmn_mm->lock);
> +	if (mn_itree_is_invalidating(mmn_mm)) {
> +		/*
> +		 * remove is being called after insert put this on the
> +		 * deferred list, but before the deferred list was processed.
> +		 */
> +		if (RB_EMPTY_NODE(&mrn->interval_tree.rb)) {
> +			hlist_del(&mrn->deferred_item);
> +		} else {
> +			hlist_add_head(&mrn->deferred_item,
> +				       &mmn_mm->deferred_list);
> +			seq = mmn_mm->invalidate_seq;
> +		}
> +	} else {
> +		WARN_ON(RB_EMPTY_NODE(&mrn->interval_tree.rb));
> +		interval_tree_remove(&mrn->interval_tree, &mmn_mm->itree);
> +	}
> +	spin_unlock(&mmn_mm->lock);
> +
> +	/*
> +	 * The possible sleep on progress in the invalidation requires the
> +	 * caller not hold any locks held by invalidation callbacks.
> +	 */
> +	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
> +	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
> +	if (seq)
> +		wait_event(mmn_mm->wq,
> +			   READ_ONCE(mmn_mm->invalidate_seq) != seq);
> +
> +	/* pairs with mmgrab in mmu_range_notifier_insert() */
> +	mmdrop(mm);
> +}
> +EXPORT_SYMBOL_GPL(mmu_range_notifier_remove);
> +
>  /**
>   * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
>   *
> -- 
> 2.23.0
> 


