[Intel-gfx] [PATCH v4] drm/i915: Implement inter-engine read-read optimisations

Thu Mar 26 14:31:40 PDT 2015

On Thu, Mar 26, 2015 at 05:43:33PM +0000, Tvrtko Ursulin wrote:
> >-static int
> >-i915_gem_object_wait_rendering__tail(struct drm_i915_gem_object *obj)
> >-{
> >-	if (!obj->active)
> >-		return 0;
> >-
> >-	/* Manually manage the write flush as we may have not yet
> >-	 * retired the buffer.
> >-	 *
> >-	 * Note that the last_write_req is always the earlier of
> >-	 * the two (read/write) requests, so if we haved successfully waited,
> >-	 * we know we have passed the last write.
> >-	 */
> >-	i915_gem_request_assign(&obj->last_write_req, NULL);
> >-
> >-	return 0;
> >+	return __i915_wait_request(req, reset_counter,
> >+				   interruptible, NULL, NULL);
> >  }
> 
> Net code comments -/+ for this patch needs improvement. :) Above you
> deleted a chunk but below added nothing.

I did. It's not added here as this code is buggy.

> I think something high level is needed to explain the active lists
> and tracking etc.

GEM object domain management and eviction.

> >  /**
> >@@ -1405,18 +1381,39 @@ static __must_check int
> >  i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj,
> >  			       bool readonly)
> >  {
> >-	struct drm_i915_gem_request *req;
> >-	int ret;
> >+	int ret, i;
> >
> >-	req = readonly ? obj->last_write_req : obj->last_read_req;
> >-	if (!req)
> >+	if (!obj->active)
> >  		return 0;
> >
> >-	ret = i915_wait_request(req);
> >-	if (ret)
> >-		return ret;
> >+	if (readonly) {
> >+		if (obj->last_write_req != NULL) {
> >+			ret = i915_wait_request(obj->last_write_req);
> >+			if (ret)
> >+				return ret;
> >+
> >+			i = obj->last_write_req->ring->id;
> >+			if (obj->last_read_req[i] == obj->last_write_req)
> >+				i915_gem_object_retire__read(obj, i);
> >+			else
> >+				i915_gem_object_retire__write(obj);
> 
> Above mentioned comments would especially help understand the
> ordering rules here.
> 
> >+		}
> >+	} else {
> >+		for (i = 0; i < I915_NUM_RINGS; i++) {
> >+			if (obj->last_read_req[i] == NULL)
> >+				continue;
> >+
> >+			ret = i915_wait_request(obj->last_read_req[i]);
> >+			if (ret)
> >+				return ret;
> >+
> >+			i915_gem_object_retire__read(obj, i);
> >+		}
> 
> I think with optimistic spinning this could end up doing num_rings
> spins etc in sequence. Would it be worth smarting it up somehow?

Hmm. Good point. Obviously the goal of the optimisation is to have more
opportunities where we never have to wait at at all. Writing a new
i915_wait_requests() will add a lot of code, definitely something I want
to postpone. But given the sequential wait, later ones are more likely
to already be complete.

> >+		BUG_ON(obj->active);
> 
> Better WARN and halt the driver indirectly if unavoidable.

It's a debug leftover, it's gone.

> >+	}
> >+
> >+	return 0;
> >
> >-	return i915_gem_object_wait_rendering__tail(obj);
> >  }
> >
> >  /* A nonblocking variant of the above wait. This is a highly dangerous routine
> >@@ -1427,37 +1424,71 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
> >  					    struct drm_i915_file_private *file_priv,
> >  					    bool readonly)
> >  {
> >-	struct drm_i915_gem_request *req;
> >  	struct drm_device *dev = obj->base.dev;
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >+	struct drm_i915_gem_request *requests[I915_NUM_RINGS];
> >  	unsigned reset_counter;
> >-	int ret;
> >+	int ret, i, n = 0;
> >
> >  	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
> >  	BUG_ON(!dev_priv->mm.interruptible);
> >
> >-	req = readonly ? obj->last_write_req : obj->last_read_req;
> >-	if (!req)
> >+	if (!obj->active)
> >  		return 0;
> >
> >  	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
> >  	if (ret)
> >  		return ret;
> >
> >-	ret = i915_gem_check_olr(req);
> >-	if (ret)
> >-		return ret;
> >-
> >  	reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
> >-	i915_gem_request_reference(req);
> 
> Good place for a comment: Build an array of requests to be waited upon etc..

Code comments need to explain why.

> >+
> >+	if (readonly) {
> >+		struct drm_i915_gem_request *rq;
> >+
> >+		rq = obj->last_write_req;
> >+		if (rq == NULL)
> >+			return 0;
> >+
> >+		ret = i915_gem_check_olr(rq);
> >+		if (ret)
> >+			goto err;
> >+
> >+		requests[n++] = i915_gem_request_reference(rq);
> >+	} else {
> >+		for (i = 0; i < I915_NUM_RINGS; i++) {
> >+			struct drm_i915_gem_request *rq;
> >+
> >+			rq = obj->last_read_req[i];
> >+			if (rq == NULL)
> >+				continue;
> >+
> >+			ret = i915_gem_check_olr(rq);
> >+			if (ret)
> >+				goto err;
> >+
> >+			requests[n++] = i915_gem_request_reference(rq);
> >+		}
> >+	}
> 
> I wonder if merging the tracked requests to a single array, roughly
> something like:
> 
> 	obj->last_req[1 + NUM_RINGS]
> 
> and:
> 
> 	LAST_WRITE_REQ = 0
> 	LAST_READ_REQ(ring) = 1 + ring
> 
> Could make the above logic more compact and how would it look
> elsewhere in the code? Definitely looks like it would work for the
> above loop:
> 
> 	for (i = LAST_WRITE_REQ; i <= LAST_TRACKED_REQUEST; i++) {
> 		rq = obj->last_req[i];
> 		if (rq == NULL && readonly)
> 			return 0;
> 		else if (rq == NULL)
> 			continue;
> 		...
> 		requests[n++] = ...
> 		if (readonly)
> 			break;
> 	}
> 
> Not sure, might not be that great idea.

Nah. I think it's only a win here. Elsewhere there is greater
diferrentiation between read/write. Conceptually it is even

	struct {
		struct drm_i915_gem_request *request;
		struct list_head active_link;
	} read[I915_NUM_RINGS];
	struct drm_i915_gem_request *write_request, *fence_request;

> >  	mutex_unlock(&dev->struct_mutex);
> >-	ret = __i915_wait_request(req, reset_counter, true, NULL, file_priv);
> >+	for (i = 0; ret == 0 && i < n; i++)
> >+		ret = __i915_wait_request(requests[i], reset_counter, true,
> >+					  NULL, file_priv);
> 
> Another chance for more optimal "group" waiting.
> 
> >  	mutex_lock(&dev->struct_mutex);
> >-	i915_gem_request_unreference(req);
> >-	if (ret)
> >-		return ret;
> >
> >-	return i915_gem_object_wait_rendering__tail(obj);
> >+err:
> >+	for (i = 0; i < n; i++) {
> >+		if (ret == 0) {
> >+			int ring = requests[i]->ring->id;
> >+			if (obj->last_read_req[ring] == requests[i])
> >+				i915_gem_object_retire__read(obj, ring);
> >+			if (obj->last_write_req == requests[i])
> >+				i915_gem_object_retire__write(obj);
> >+		}
> 
> What if one wait succeeded and one failed, no need to update anything?

Too much complication for an error path. This just aims to optimise the
bookkeeping for an object after a wait.

> >-static void
> >-i915_gem_object_move_to_active(struct drm_i915_gem_object *obj,
> >-			       struct intel_engine_cs *ring)
> >+void i915_vma_move_to_active(struct i915_vma *vma,
> >+			     struct intel_engine_cs *ring)
> >  {
> >-	struct drm_i915_gem_request *req;
> >-	struct intel_engine_cs *old_ring;
> >-
> >-	BUG_ON(ring == NULL);
> >-
> >-	req = intel_ring_get_request(ring);
> >-	old_ring = i915_gem_request_get_ring(obj->last_read_req);
> >-
> >-	if (old_ring != ring && obj->last_write_req) {
> >-		/* Keep the request relative to the current ring */
> >-		i915_gem_request_assign(&obj->last_write_req, req);
> >-	}
> >+	struct drm_i915_gem_object *obj = vma->obj;
> >
> >  	/* Add a reference if we're newly entering the active list. */
> >-	if (!obj->active) {
> >+	if (obj->last_read_req[ring->id] == NULL && obj->active++ == 0)
> >  		drm_gem_object_reference(&obj->base);
> >-		obj->active = 1;
> >-	}
> >+	BUG_ON(obj->active == 0 || obj->active > I915_NUM_RINGS);
> 
> Maybe we need I915_WARN_AND_HALT() which would be a WARN() plus put
> the driver in halted state?

This just magically evaporates by moving over to an obj->active
bitfield. You'll love v5, but hate v6 (which abuses a lots more internal
knowledge of request management).

> >+	if (to == NULL) {
> >+		ret = i915_gem_object_wait_rendering(obj, readonly);
> >+	} else if (readonly) {
> >+		ret = __i915_gem_object_sync(obj, to,
> >+					     obj->last_write_req);
> >+	} else {
> >+		for (i = 0; i < I915_NUM_RINGS; i++) {
> >+			ret = __i915_gem_object_sync(obj, to,
> >+						     obj->last_read_req[i]);
> 
> Here I think is another opportunity to wait for all of them at once.
> Via a __i915_gem_object_sync helper which would take an array, or a
> ring/request mask. Not sure if it would be worth it though.

No. This is more complicated still since we have the semaphores to also
deal will. Definitely worth waiting for a testcase.

> >-	args->busy = obj->active;
> >-	if (obj->last_read_req) {
> >-		struct intel_engine_cs *ring;
> >  		BUILD_BUG_ON(I915_NUM_RINGS > 16);
> >-		ring = i915_gem_request_get_ring(obj->last_read_req);
> >-		args->busy |= intel_ring_flag(ring) << 16;
> >+
> >+		for (i = 0; i < I915_NUM_RINGS; i++) {
> >+			if (obj->last_read_req[i] == NULL)
> >+				continue;
> >+
> >+			args->busy |= 1 << (16 + i) | 1;
> 
> Doesn't look like equivalent bits will be set? What is the "| 1" at
> the end for?

No. It's designed for. This is what userspaces expects and is required
to help workaround #70764.

I want to replace the (| 1) with
(| intel_ring_flag(obj->last_write_request->ring); but it exists because
we couldn't be sure if any userspace depended on exactly (0, 1).

> >diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> >index 0efb19a9b9a5..1a3756e6bea4 100644
> >--- a/drivers/gpu/drm/i915/intel_display.c
> >+++ b/drivers/gpu/drm/i915/intel_display.c
> >@@ -9674,7 +9674,7 @@ static bool use_mmio_flip(struct intel_engine_cs *ring,
> >  	else if (i915.enable_execlists)
> >  		return true;
> >  	else
> >-		return ring != i915_gem_request_get_ring(obj->last_read_req);
> >+		return ring != i915_gem_request_get_ring(obj->last_write_req);
> >  }
> 
> Why this?

Because that is correct.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre