[Intel-gfx] [PATCH] drm/i915: Avoid keeping waitboost active for signaling threads
Michał Winiarski
michal.winiarski at intel.com
Fri Jun 23 10:35:06 UTC 2017
On Thu, Jun 22, 2017 at 11:55:51AM +0100, Chris Wilson wrote:
> Once a client has requested a waitboost, we keep that waitboost active
> until all clients are no longer waiting. This is because we don't
> distinguish which waiter deserves the boost. However, with the advent of
> fence signaling, the signaler threads appear as waiters to the RPS
> interrupt handler. So instead of using a single boolean to track when to
> keep the waitboost active, use a counter of all outstanding waitboosted
> requests.
>
> At this point, I have removed all vestiges of the rate limiting on
> clients. Whilst this means that compositors should remain more fluid,
> it also means that boosts are more prevalent.
>
> A drawback of this implementation is that it requires constant request
> submission to keep the waitboost trimmed (as it is now cancelled when the
> request is completed). This will be fine for a busy system, but near
> idle the boosts may be kept for longer than desired (effectively tens of
> vblanks worst case) and there is a reliance on rc6 instead.
In other words, the boost is now dropped only once all requests that required it
have been retired.
We may need to tweak igt pm_rps boost scenarios to account for that extra time.
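For my own sanity, here's a minimal userspace sketch of the lifecycle as I read
it (names borrowed from the patch; rq->lock and the actual RPS work are
omitted, so this is only an illustration, not the kernel code): a request takes
at most one boost while it is still outstanding and drops it at retire, which
is why num_waiters only falls back to zero once every boosted request has been
retired.

/* Build with: gcc -std=c11 -pthread boost_sketch.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct rps { atomic_int num_waiters; };                 /* dev_priv->rps.num_waiters */
struct request { struct rps *rps; bool waitboost; bool completed; };

/* gen6_rps_boost(): take at most one boost per request, and only while
 * the request is still outstanding. */
static void boost(struct request *rq)
{
	if (!rq->waitboost && !rq->completed) {
		atomic_fetch_add(&rq->rps->num_waiters, 1);
		rq->waitboost = true;
	}
}

/* i915_gem_request_retire(): drop the boost when the request completes,
 * so the waitboost lives only as long as a boosted request is in flight. */
static void retire(struct request *rq)
{
	rq->completed = true;
	if (rq->waitboost)
		atomic_fetch_sub(&rq->rps->num_waiters, 1);
}

int main(void)
{
	struct rps rps = { .num_waiters = 0 };
	struct request rq = { .rps = &rps };

	boost(&rq);     /* waiter arrives: counter goes to 1 */
	boost(&rq);     /* second waiter on the same request: no double count */
	retire(&rq);    /* request retired: counter returns to 0 */

	printf("num_waiters = %d\n", atomic_load(&rps.num_waiters));
	return 0;
}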
>
> Reported-by: Michał Winiarski <michal.winiarski at intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
[SNIP]
> @@ -2347,11 +2349,10 @@ static int i915_rps_boost_info(struct seq_file *m, void *data)
>
> rcu_read_lock();
> task = pid_task(file->pid, PIDTYPE_PID);
> - seq_printf(m, "%s [%d]: %d boosts%s\n",
> + seq_printf(m, "%s [%d]: %d boosts\n",
> task ? task->comm : "<unknown>",
> task ? task->pid : -1,
> - file_priv->rps.boosts,
> - list_empty(&file_priv->rps.link) ? "" : ", active");
> + atomic_read(&file_priv->rps.boosts));
> rcu_read_unlock();
> }
> seq_printf(m, "Kernel (anonymous) boosts: %d\n", dev_priv->rps.boosts);
dev_priv->rps.boosts seems to be unused at this point; could you remove it while
you're here?
With that:
Reviewed-by: Michał Winiarski <michal.winiarski at intel.com>
-Michał
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 30e89456fc61..95e164387363 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -584,8 +584,7 @@ struct drm_i915_file_private {
> struct idr context_idr;
>
> struct intel_rps_client {
> - struct list_head link;
> - unsigned boosts;
> + atomic_t boosts;
> } rps;
>
> unsigned int bsd_engine;
> @@ -1304,8 +1303,7 @@ struct intel_gen6_power_mgmt {
> enum { LOW_POWER, BETWEEN, HIGH_POWER } power;
>
> spinlock_t client_lock;
> - struct list_head clients;
> - bool client_boost;
> + atomic_t num_waiters;
>
> bool enabled;
> struct delayed_work autoenable_work;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ae3ce1314bd1..7391e2d36a31 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -388,7 +388,7 @@ i915_gem_object_wait_fence(struct dma_fence *fence,
> */
> if (rps) {
> if (INTEL_GEN(rq->i915) >= 6)
> - gen6_rps_boost(rq->i915, rps, rq->emitted_jiffies);
> + gen6_rps_boost(rq, rps);
> else
> rps = NULL;
> }
> @@ -399,22 +399,6 @@ i915_gem_object_wait_fence(struct dma_fence *fence,
> if (flags & I915_WAIT_LOCKED && i915_gem_request_completed(rq))
> i915_gem_request_retire_upto(rq);
>
> - if (rps && i915_gem_request_global_seqno(rq) == intel_engine_last_submit(rq->engine)) {
> - /* The GPU is now idle and this client has stalled.
> - * Since no other client has submitted a request in the
> - * meantime, assume that this client is the only one
> - * supplying work to the GPU but is unable to keep that
> - * work supplied because it is waiting. Since the GPU is
> - * then never kept fully busy, RPS autoclocking will
> - * keep the clocks relatively low, causing further delays.
> - * Compensate by giving the synchronous client credit for
> - * a waitboost next time.
> - */
> - spin_lock(&rq->i915->rps.client_lock);
> - list_del_init(&rps->link);
> - spin_unlock(&rq->i915->rps.client_lock);
> - }
> -
> return timeout;
> }
>
> @@ -5065,12 +5049,6 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
> list_for_each_entry(request, &file_priv->mm.request_list, client_link)
> request->file_priv = NULL;
> spin_unlock(&file_priv->mm.lock);
> -
> - if (!list_empty(&file_priv->rps.link)) {
> - spin_lock(&to_i915(dev)->rps.client_lock);
> - list_del(&file_priv->rps.link);
> - spin_unlock(&to_i915(dev)->rps.client_lock);
> - }
> }
>
> int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
> @@ -5087,7 +5065,6 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
> file->driver_priv = file_priv;
> file_priv->dev_priv = i915;
> file_priv->file = file;
> - INIT_LIST_HEAD(&file_priv->rps.link);
>
> spin_lock_init(&file_priv->mm.lock);
> INIT_LIST_HEAD(&file_priv->mm.request_list);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 8c59c79cbd8b..483af8921060 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -384,7 +384,11 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> engine->context_unpin(engine, engine->last_retired_context);
> engine->last_retired_context = request->ctx;
>
> - dma_fence_signal(&request->fence);
> + spin_lock_irq(&request->lock);
> + if (request->waitboost)
> + atomic_dec(&request->i915->rps.num_waiters);
> + dma_fence_signal_locked(&request->fence);
> + spin_unlock_irq(&request->lock);
>
> i915_priotree_fini(request->i915, &request->priotree);
> i915_gem_request_put(request);
> @@ -639,6 +643,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> req->file_priv = NULL;
> req->batch = NULL;
> req->capture_list = NULL;
> + req->waitboost = false;
>
> /*
> * Reserve space in the ring buffer for all the commands required to
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 7b7c84369d78..604e131470a1 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -184,6 +184,8 @@ struct drm_i915_gem_request {
> /** Time at which this request was emitted, in jiffies. */
> unsigned long emitted_jiffies;
>
> + bool waitboost;
> +
> /** engine->request_list entry for this request */
> struct list_head link;
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index b1c7d1a04612..61047ee2eede 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1091,18 +1091,6 @@ static u32 vlv_wa_c0_ei(struct drm_i915_private *dev_priv, u32 pm_iir)
> return events;
> }
>
> -static bool any_waiters(struct drm_i915_private *dev_priv)
> -{
> - struct intel_engine_cs *engine;
> - enum intel_engine_id id;
> -
> - for_each_engine(engine, dev_priv, id)
> - if (intel_engine_has_waiter(engine))
> - return true;
> -
> - return false;
> -}
> -
> static void gen6_pm_rps_work(struct work_struct *work)
> {
> struct drm_i915_private *dev_priv =
> @@ -1114,7 +1102,7 @@ static void gen6_pm_rps_work(struct work_struct *work)
> spin_lock_irq(&dev_priv->irq_lock);
> if (dev_priv->rps.interrupts_enabled) {
> pm_iir = fetch_and_zero(&dev_priv->rps.pm_iir);
> - client_boost = fetch_and_zero(&dev_priv->rps.client_boost);
> + client_boost = atomic_read(&dev_priv->rps.num_waiters);
> }
> spin_unlock_irq(&dev_priv->irq_lock);
>
> @@ -1131,7 +1119,7 @@ static void gen6_pm_rps_work(struct work_struct *work)
> new_delay = dev_priv->rps.cur_freq;
> min = dev_priv->rps.min_freq_softlimit;
> max = dev_priv->rps.max_freq_softlimit;
> - if (client_boost || any_waiters(dev_priv))
> + if (client_boost)
> max = dev_priv->rps.max_freq;
> if (client_boost && new_delay < dev_priv->rps.boost_freq) {
> new_delay = dev_priv->rps.boost_freq;
> @@ -1144,7 +1132,7 @@ static void gen6_pm_rps_work(struct work_struct *work)
>
> if (new_delay >= dev_priv->rps.max_freq_softlimit)
> adj = 0;
> - } else if (client_boost || any_waiters(dev_priv)) {
> + } else if (client_boost) {
> adj = 0;
> } else if (pm_iir & GEN6_PM_RP_DOWN_TIMEOUT) {
> if (dev_priv->rps.cur_freq > dev_priv->rps.efficient_freq)
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index d93efb49a2e2..d17a32437f07 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -1858,9 +1858,8 @@ void intel_suspend_gt_powersave(struct drm_i915_private *dev_priv);
> void gen6_rps_busy(struct drm_i915_private *dev_priv);
> void gen6_rps_reset_ei(struct drm_i915_private *dev_priv);
> void gen6_rps_idle(struct drm_i915_private *dev_priv);
> -void gen6_rps_boost(struct drm_i915_private *dev_priv,
> - struct intel_rps_client *rps,
> - unsigned long submitted);
> +void gen6_rps_boost(struct drm_i915_gem_request *rq,
> + struct intel_rps_client *rps);
> void intel_queue_rps_boost_for_request(struct drm_i915_gem_request *req);
> void g4x_wm_get_hw_state(struct drm_device *dev);
> void vlv_wm_get_hw_state(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index b5b7372fcddc..fabea61ca0b6 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -6125,47 +6125,36 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
> gen6_sanitize_rps_pm_mask(dev_priv, ~0));
> }
> mutex_unlock(&dev_priv->rps.hw_lock);
> -
> - spin_lock(&dev_priv->rps.client_lock);
> - while (!list_empty(&dev_priv->rps.clients))
> - list_del_init(dev_priv->rps.clients.next);
> - spin_unlock(&dev_priv->rps.client_lock);
> }
>
> -void gen6_rps_boost(struct drm_i915_private *dev_priv,
> - struct intel_rps_client *rps,
> - unsigned long submitted)
> +void gen6_rps_boost(struct drm_i915_gem_request *rq,
> + struct intel_rps_client *rps)
> {
> + struct drm_i915_private *i915 = rq->i915;
> + bool boost;
> +
> /* This is intentionally racy! We peek at the state here, then
> * validate inside the RPS worker.
> */
> - if (!(dev_priv->gt.awake &&
> - dev_priv->rps.enabled &&
> - dev_priv->rps.cur_freq < dev_priv->rps.boost_freq))
> + if (!i915->rps.enabled)
> return;
>
> - /* Force a RPS boost (and don't count it against the client) if
> - * the GPU is severely congested.
> - */
> - if (rps && time_after(jiffies, submitted + DRM_I915_THROTTLE_JIFFIES))
> - rps = NULL;
> -
> - spin_lock(&dev_priv->rps.client_lock);
> - if (rps == NULL || list_empty(&rps->link)) {
> - spin_lock_irq(&dev_priv->irq_lock);
> - if (dev_priv->rps.interrupts_enabled) {
> - dev_priv->rps.client_boost = true;
> - schedule_work(&dev_priv->rps.work);
> - }
> - spin_unlock_irq(&dev_priv->irq_lock);
> -
> - if (rps != NULL) {
> - list_add(&rps->link, &dev_priv->rps.clients);
> - rps->boosts++;
> - } else
> - dev_priv->rps.boosts++;
> + boost = false;
> + spin_lock_irq(&rq->lock);
> + if (!rq->waitboost && !i915_gem_request_completed(rq)) {
> + atomic_inc(&i915->rps.num_waiters);
> + rq->waitboost = true;
> + boost = true;
> }
> - spin_unlock(&dev_priv->rps.client_lock);
> + spin_unlock_irq(&rq->lock);
> + if (!boost)
> + return;
> +
> + if (READ_ONCE(i915->rps.cur_freq) < i915->rps.boost_freq)
> + schedule_work(&i915->rps.work);
> +
> + if (rps != NULL)
> + atomic_inc(&rps->boosts);
> }
>
> int intel_set_rps(struct drm_i915_private *dev_priv, u8 val)
> @@ -9112,7 +9101,7 @@ static void __intel_rps_boost_work(struct work_struct *work)
> struct drm_i915_gem_request *req = boost->req;
>
> if (!i915_gem_request_completed(req))
> - gen6_rps_boost(req->i915, NULL, req->emitted_jiffies);
> + gen6_rps_boost(req, NULL);
>
> i915_gem_request_put(req);
> kfree(boost);
> @@ -9141,11 +9130,10 @@ void intel_queue_rps_boost_for_request(struct drm_i915_gem_request *req)
> void intel_pm_setup(struct drm_i915_private *dev_priv)
> {
> mutex_init(&dev_priv->rps.hw_lock);
> - spin_lock_init(&dev_priv->rps.client_lock);
>
> INIT_DELAYED_WORK(&dev_priv->rps.autoenable_work,
> __intel_autoenable_gt_powersave);
> - INIT_LIST_HEAD(&dev_priv->rps.clients);
> + atomic_set(&dev_priv->rps.num_waiters, 0);
>
> dev_priv->pm.suspended = false;
> atomic_set(&dev_priv->pm.wakeref_count, 0);
> --
> 2.11.0
>