[Intel-gfx] [PATCH 06/17] drm/i915: Simplify ELSP queue request tracking
Mika Kuoppala
mika.kuoppala at linux.intel.com
Mon Aug 29 12:31:11 UTC 2016
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Emulate HW to track and manage the ELSP queue. A set of SW ports is
> defined and requests are assigned to these ports before submitting them
> to HW. This makes it easier to clean up incomplete requests during reset
> recovery, especially after an engine reset, by decoupling ELSP queue
> management. This will become clearer in the next patch.
>
> In the engine reset case we want to resume where we left off after
> skipping the incomplete batch, which requires checking the ELSP queue,
> removing elements and fixing elsp_submitted counts in some cases. Instead
> of directly manipulating the ELSP queue from the reset path, we can
> examine these ports, fix up the ringbuffer pointers using the incomplete
> request and restart submissions after reset.
>
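The SW port model described here reduces, as I understand it, to the
two-element array this patch adds to intel_engine_cs (simplified sketch
of the idea, not the exact code):

    struct execlist_port {
        struct drm_i915_gem_request *request; /* owned reference */
        unsigned int count; /* submissions to HW; >1 on lite restore */
    } execlist_port[2]; /* mirrors the 2-deep HW ELSP */

so the reset path only needs to inspect and clear these two ports
instead of walking and fixing up the whole execlist_queue.
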
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Arun Siluvery <arun.siluvery at linux.intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-3-git-send-email-arun.siluvery@linux.intel.com
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 2 +-
> drivers/gpu/drm/i915/i915_gem.c | 13 +-
> drivers/gpu/drm/i915/i915_gem_request.c | 1 -
> drivers/gpu/drm/i915/i915_gem_request.h | 21 +-
> drivers/gpu/drm/i915/intel_lrc.c | 405 ++++++++++++++------------------
> drivers/gpu/drm/i915/intel_lrc.h | 2 -
> drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +-
> 7 files changed, 188 insertions(+), 263 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index d89359a50742..5f932ebc6ff1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2074,7 +2074,7 @@ static int i915_execlists(struct seq_file *m, void *data)
> status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
> seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
>
> - read_pointer = engine->next_context_status_buffer;
> + read_pointer = GEN8_CSB_READ_PTR(status_pointer);
> write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
> if (read_pointer > write_pointer)
> write_pointer += GEN8_CSB_ENTRIES;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d37b44126942..838a275e7fac 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2575,6 +2575,9 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
> struct drm_i915_gem_request *request;
> struct intel_ring *ring;
>
> + /* Ensure irq handler finishes, and not run again. */
> + tasklet_kill(&engine->irq_tasklet);
> +
> /* Mark all pending requests as complete so that any concurrent
> * (lockless) lookup doesn't try and wait upon the request as we
> * reset it.
> @@ -2588,10 +2591,12 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
> */
>
> if (i915.enable_execlists) {
> - /* Ensure irq handler finishes or is cancelled. */
> - tasklet_kill(&engine->irq_tasklet);
> -
> - intel_execlists_cancel_requests(engine);
> + spin_lock(&engine->execlist_lock);
> + INIT_LIST_HEAD(&engine->execlist_queue);
> + i915_gem_request_put(engine->execlist_port[0].request);
> + i915_gem_request_put(engine->execlist_port[1].request);
> + memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> + spin_unlock(&engine->execlist_lock);
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9cc08a1e43c6..ec613fd5e01c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -402,7 +402,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> req->previous_context = NULL;
> req->file_priv = NULL;
> req->batch = NULL;
> - req->elsp_submitted = 0;
>
> /*
> * Reserve space in the ring buffer for all the commands required to
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 2faa3bb4c39b..a231bd318ef0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -137,27 +137,8 @@ struct drm_i915_gem_request {
> /** file_priv list entry for this request */
> struct list_head client_list;
>
> - /**
> - * The ELSP only accepts two elements at a time, so we queue
> - * context/tail pairs on a given queue (ring->execlist_queue) until the
> - * hardware is available. The queue serves a double purpose: we also use
> - * it to keep track of the up to 2 contexts currently in the hardware
> - * (usually one in execution and the other queued up by the GPU): We
> - * only remove elements from the head of the queue when the hardware
> - * informs us that an element has been completed.
> - *
> - * All accesses to the queue are mediated by a spinlock
> - * (ring->execlist_lock).
> - */
> -
> - /** Execlist link in the submission queue.*/
> + /** Link in the execlist submission queue, guarded by execlist_lock. */
> struct list_head execlist_link;
> -
> - /** Execlists no. of times this request has been sent to the ELSP */
> - int elsp_submitted;
> -
> - /** Execlists context hardware id. */
> - unsigned int ctx_hw_id;
> };
>
> extern const struct fence_ops i915_fence_ops;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e9cb4a906009..864b5248279a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -156,6 +156,11 @@
> #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
> #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
>
> +#define GEN8_CTX_STATUS_COMPLETED_MASK \
> + (GEN8_CTX_STATUS_ACTIVE_IDLE | \
> + GEN8_CTX_STATUS_PREEMPTED | \
> + GEN8_CTX_STATUS_ELEMENT_SWITCH)
> +
> #define CTX_LRI_HEADER_0 0x01
> #define CTX_CONTEXT_CONTROL 0x02
> #define CTX_RING_HEAD 0x04
> @@ -263,12 +268,10 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
> {
> struct drm_i915_private *dev_priv = engine->i915;
>
> - if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
> - engine->idle_lite_restore_wa = ~0;
> -
> - engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
> - IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
> - (engine->id == VCS || engine->id == VCS2);
> + engine->disable_lite_restore_wa =
> + (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
> + IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
> + (engine->id == VCS || engine->id == VCS2);
>
> engine->ctx_desc_template = GEN8_CTX_VALID;
> if (IS_GEN8(dev_priv))
> @@ -351,11 +354,11 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
> ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> }
>
> -static void execlists_update_context(struct drm_i915_gem_request *rq)
> +static u64 execlists_update_context(struct drm_i915_gem_request *rq)
> {
> - struct intel_engine_cs *engine = rq->engine;
> + struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
> struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
> - uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;
> + u32 *reg_state = ce->lrc_reg_state;
>
> reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
>
> @@ -366,26 +369,34 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
> */
> if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
> execlists_update_context_pdps(ppgtt, reg_state);
> +
> + return ce->lrc_desc;
> }
>
> -static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> - struct drm_i915_gem_request *rq1)
> +static void execlists_submit_ports(struct intel_engine_cs *engine)
> {
> - struct intel_engine_cs *engine = rq0->engine;
> - struct drm_i915_private *dev_priv = rq0->i915;
> + struct drm_i915_private *dev_priv = engine->i915;
> + struct execlist_port *port = engine->execlist_port;
> u32 __iomem *elsp =
> dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> u64 desc[2];
>
> - if (rq1) {
> - desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
> - rq1->elsp_submitted++;
> + if (!port[0].count)
> + execlists_context_status_change(port[0].request,
> + INTEL_CONTEXT_SCHEDULE_IN);
> + desc[0] = execlists_update_context(port[0].request);
> + engine->preempt_wa = port[0].count++; /* bdw only? fixed on skl? */
> +
Experiments show that we need this on gen9 also, even though some
documentation suggests that we don't.
This one patch took a lot of coffee.
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
> + if (port[1].request) {
> + GEM_BUG_ON(port[1].count);
> + execlists_context_status_change(port[1].request,
> + INTEL_CONTEXT_SCHEDULE_IN);
> + desc[1] = execlists_update_context(port[1].request);
> + port[1].count = 1;
> } else {
> desc[1] = 0;
> }
> -
> - desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
> - rq0->elsp_submitted++;
> + GEM_BUG_ON(desc[0] == desc[1]);
>
> /* You must always write both descriptors in the order below. */
> writel(upper_32_bits(desc[1]), elsp);
> @@ -396,141 +407,125 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> writel(lower_32_bits(desc[0]), elsp);
> }
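
For context, since the hunk boundary elides the middle two writes: ELSP
takes four dword writes, and if I remember the docs right the HW does
not act until the last one lands, hence the strict ordering comment:

    writel(upper_32_bits(desc[1]), elsp);
    writel(lower_32_bits(desc[1]), elsp);
    writel(upper_32_bits(desc[0]), elsp);
    /* execution kicks off only on this final write */
    writel(lower_32_bits(desc[0]), elsp);
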
>
> -static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
> - struct drm_i915_gem_request *rq1)
> +static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
> {
> - struct drm_i915_private *dev_priv = rq0->i915;
> - unsigned int fw_domains = rq0->engine->fw_domains;
> -
> - execlists_update_context(rq0);
> -
> - if (rq1)
> - execlists_update_context(rq1);
> + return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> + ctx->execlists_force_single_submission);
> +}
>
> - spin_lock_irq(&dev_priv->uncore.lock);
> - intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
> +static bool can_merge_ctx(const struct i915_gem_context *prev,
> + const struct i915_gem_context *next)
> +{
> + if (prev != next)
> + return false;
>
> - execlists_elsp_write(rq0, rq1);
> + if (ctx_single_port_submission(prev))
> + return false;
>
> - intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
> - spin_unlock_irq(&dev_priv->uncore.lock);
> + return true;
> }
>
> -static void execlists_unqueue(struct intel_engine_cs *engine)
> +static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> - struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
> - struct drm_i915_gem_request *cursor, *tmp;
> + struct drm_i915_gem_request *cursor, *last;
> + struct execlist_port *port = engine->execlist_port;
> + bool submit = false;
> +
> + last = port->request;
> + if (last)
> + /* WaIdleLiteRestore:bdw,skl
> + * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> + * as we resubmit the request. See gen8_emit_request()
> + * for where we prepare the padding after the end of the
> + * request.
> + */
> + last->tail = last->wa_tail;
>
> - assert_spin_locked(&engine->execlist_lock);
> + GEM_BUG_ON(port[1].request);
>
> - /*
> - * If irqs are not active generate a warning as batches that finish
> - * without the irqs may get lost and a GPU Hang may occur.
> + /* Hardware submission is through 2 ports. Conceptually each port
> + * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> + * static for a context, and unique to each, so we only execute
> + * requests belonging to a single context from each ring. RING_HEAD
> + * is maintained by the CS in the context image, it marks the place
> + * where it got up to last time, and through RING_TAIL we tell the CS
> + * where we want to execute up to this time.
> + *
> + * In this list the requests are in order of execution. Consecutive
> + * requests from the same context are adjacent in the ringbuffer. We
> + * can combine these requests into a single RING_TAIL update:
> + *
> + * RING_HEAD...req1...req2
> + * ^- RING_TAIL
> + * since to execute req2 the CS must first execute req1.
> + *
> + * Our goal then is to point each port to the end of a consecutive
> + * sequence of requests as being the most optimal (fewest wake ups
> + * and context switches) submission.
> */
> - WARN_ON(!intel_irqs_enabled(engine->i915));
> -
> - /* Try to read in pairs */
> - list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
> - execlist_link) {
> - if (!req0) {
> - req0 = cursor;
> - } else if (req0->ctx == cursor->ctx) {
> - /* Same ctx: ignore first request, as second request
> - * will update tail past first request's workload */
> - cursor->elsp_submitted = req0->elsp_submitted;
> - list_del(&req0->execlist_link);
> - i915_gem_request_put(req0);
> - req0 = cursor;
> - } else {
> - if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
> - /*
> - * req0 (after merged) ctx requires single
> - * submission, stop picking
> - */
> - if (req0->ctx->execlists_force_single_submission)
> - break;
> - /*
> - * req0 ctx doesn't require single submission,
> - * but next req ctx requires, stop picking
> - */
> - if (cursor->ctx->execlists_force_single_submission)
> - break;
> - }
> - req1 = cursor;
> - WARN_ON(req1->elsp_submitted);
> - break;
> - }
> - }
>
> - if (unlikely(!req0))
> - return;
> -
> - execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);
> -
> - if (req1)
> - execlists_context_status_change(req1,
> - INTEL_CONTEXT_SCHEDULE_IN);
> -
> - if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
> - /*
> - * WaIdleLiteRestore: make sure we never cause a lite restore
> - * with HEAD==TAIL.
> + spin_lock(&engine->execlist_lock);
> + list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
> + /* Can we combine this request with the current port? It has to
> + * be the same context/ringbuffer and not have any exceptions
> + * (e.g. GVT saying never to combine contexts).
> *
> - * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
> - * resubmit the request. See gen8_emit_request() for where we
> - * prepare the padding after the end of the request.
> + * If we can combine the requests, we can execute both by
> + * updating the RING_TAIL to point to the end of the second
> + * request, and so we never need to tell the hardware about
> + * the first.
> */
> - req0->tail = req0->wa_tail;
> + if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
> + /* If we are on the second port and cannot combine
> + * this request with the last, then we are done.
> + */
> + if (port != engine->execlist_port)
> + break;
> +
> + /* If GVT overrides us we only ever submit port[0],
> + * leaving port[1] empty. Note that we also have
> + * to be careful that we don't queue the same
> + * context (even though a different request) to
> + * the second port.
> + */
> + if (ctx_single_port_submission(cursor->ctx))
> + break;
> +
> + GEM_BUG_ON(last->ctx == cursor->ctx);
> +
> + i915_gem_request_assign(&port->request, last);
> + port++;
> + }
> + last = cursor;
> + submit = true;
> + }
> + if (submit) {
> + /* Decouple all the requests submitted from the queue */
> + engine->execlist_queue.next = &cursor->execlist_link;
> + cursor->execlist_link.prev = &engine->execlist_queue;
> +
> + i915_gem_request_assign(&port->request, last);
> }
> + spin_unlock(&engine->execlist_lock);
>
> - execlists_elsp_submit_contexts(req0, req1);
> + if (submit)
> + execlists_submit_ports(engine);
> }
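
Restating the merge rule from the loop above for my own benefit (a
paraphrase, simplified to the interesting cases):

    if (can_merge_ctx(cursor->ctx, last->ctx)) {
        /* same context: just advance the RING_TAIL target */
        last = cursor;
    } else if (port == engine->execlist_port &&
               !ctx_single_port_submission(cursor->ctx)) {
        /* different context: close port[0], start filling port[1] */
        i915_gem_request_assign(&port->request, last);
        port++;
        last = cursor;
    } /* else stop: both ports spoken for, or GVT forbids sharing */

which matches the HW constraint that each ELSP port runs exactly one
context.
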
>
> -static unsigned int
> -execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
> +static bool execlists_elsp_idle(struct intel_engine_cs *engine)
> {
> - struct drm_i915_gem_request *head_req;
> -
> - assert_spin_locked(&engine->execlist_lock);
> -
> - head_req = list_first_entry_or_null(&engine->execlist_queue,
> - struct drm_i915_gem_request,
> - execlist_link);
> -
> - if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
> - return 0;
> -
> - WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
> -
> - if (--head_req->elsp_submitted > 0)
> - return 0;
> -
> - execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);
> -
> - list_del(&head_req->execlist_link);
> - i915_gem_request_put(head_req);
> -
> - return 1;
> + return !engine->execlist_port[0].request;
> }
>
> -static u32
> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> - u32 *context_id)
> +static bool execlists_elsp_ready(struct intel_engine_cs *engine)
> {
> - struct drm_i915_private *dev_priv = engine->i915;
> - u32 status;
> -
> - read_pointer %= GEN8_CSB_ENTRIES;
> -
> - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
> -
> - if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
> - return 0;
> + int port;
>
> - *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
> - read_pointer));
> + port = 1; /* wait for a free slot */
> + if (engine->disable_lite_restore_wa || engine->preempt_wa)
> + port = 0; /* wait for GPU to be idle before continuing */
>
> - return status;
> + return !engine->execlist_port[port].request;
> }
>
> /*
> @@ -540,67 +535,56 @@ get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> static void intel_lrc_irq_handler(unsigned long data)
> {
> struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> + struct execlist_port *port = engine->execlist_port;
> struct drm_i915_private *dev_priv = engine->i915;
> - u32 status_pointer;
> - unsigned int read_pointer, write_pointer;
> - u32 csb[GEN8_CSB_ENTRIES][2];
> - unsigned int csb_read = 0, i;
> - unsigned int submit_contexts = 0;
>
> intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
>
> - status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));
> -
> - read_pointer = engine->next_context_status_buffer;
> - write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
> - if (read_pointer > write_pointer)
> - write_pointer += GEN8_CSB_ENTRIES;
> -
> - while (read_pointer < write_pointer) {
> - if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
> - break;
> - csb[csb_read][0] = get_context_status(engine, ++read_pointer,
> - &csb[csb_read][1]);
> - csb_read++;
> - }
> -
> - engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
> -
> - /* Update the read pointer to the old write pointer. Manual ringbuffer
> - * management ftw </sarcasm> */
> - I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
> - _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> - engine->next_context_status_buffer << 8));
> -
> - intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
> -
> - spin_lock(&engine->execlist_lock);
> + if (!execlists_elsp_idle(engine)) {
> + u32 __iomem *csb_mmio =
> + dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
> + u32 __iomem *buf =
> + dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
> + unsigned int csb, head, tail;
> +
> + csb = readl(csb_mmio);
> + head = GEN8_CSB_READ_PTR(csb);
> + tail = GEN8_CSB_WRITE_PTR(csb);
> + if (tail < head)
> + tail += GEN8_CSB_ENTRIES;
> + while (head < tail) {
> + unsigned int idx = ++head % GEN8_CSB_ENTRIES;
> + unsigned int status = readl(buf + 2 * idx);
> +
> + if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> + continue;
> +
> + GEM_BUG_ON(port[0].count == 0);
> + if (--port[0].count == 0) {
> + GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> + execlists_context_status_change(port[0].request,
> + INTEL_CONTEXT_SCHEDULE_OUT);
> +
> + i915_gem_request_put(port[0].request);
> + port[0] = port[1];
> + memset(&port[1], 0, sizeof(port[1]));
> +
> + engine->preempt_wa = false;
> + }
>
> - for (i = 0; i < csb_read; i++) {
> - if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
> - if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
> - if (execlists_check_remove_request(engine, csb[i][1]))
> - WARN(1, "Lite Restored request removed from queue\n");
> - } else
> - WARN(1, "Preemption without Lite Restore\n");
> + GEM_BUG_ON(port[0].count == 0 &&
> + !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> }
>
> - if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
> - GEN8_CTX_STATUS_ELEMENT_SWITCH))
> - submit_contexts +=
> - execlists_check_remove_request(engine, csb[i][1]);
> + writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> + GEN8_CSB_WRITE_PTR(csb) << 8),
> + csb_mmio);
> }
>
> - if (submit_contexts) {
> - if (!engine->disable_lite_restore_wa ||
> - (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
> - execlists_unqueue(engine);
> - }
> + if (execlists_elsp_ready(engine))
> + execlists_dequeue(engine);
>
> - spin_unlock(&engine->execlist_lock);
> -
> - if (unlikely(submit_contexts > 2))
> - DRM_ERROR("More than two context complete events?\n");
> + intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
> }
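
The head/tail wraparound in the new CSB loop threw me for a second, so
a worked example (values made up; GEN8_CSB_ENTRIES is 6):

    /* say head = 4 (read ptr) and tail = 1 (write ptr has wrapped) */
    if (tail < head)
        tail += GEN8_CSB_ENTRIES;                 /* tail = 7 */
    while (head < tail) {                         /* head = 4, 5, 6 */
        unsigned int idx = ++head % GEN8_CSB_ENTRIES; /* idx = 5, 0, 1 */
        /* ... consume csb[idx] ... */
    }

i.e. entries 5, 0 and 1 are consumed, in write order.
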
>
> static void execlists_submit_request(struct drm_i915_gem_request *request)
> @@ -609,12 +593,9 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>
> spin_lock_bh(&engine->execlist_lock);
>
> - i915_gem_request_get(request);
> - request->ctx_hw_id = request->ctx->hw_id;
> -
> - if (list_empty(&engine->execlist_queue))
> - tasklet_hi_schedule(&engine->irq_tasklet);
> list_add_tail(&request->execlist_link, &engine->execlist_queue);
> + if (execlists_elsp_idle(engine))
> + tasklet_hi_schedule(&engine->irq_tasklet);
>
> spin_unlock_bh(&engine->execlist_lock);
> }
> @@ -721,23 +702,6 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
> return 0;
> }
>
> -void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
> -{
> - struct drm_i915_gem_request *req, *tmp;
> - LIST_HEAD(cancel_list);
> -
> - WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));
> -
> - spin_lock_bh(&engine->execlist_lock);
> - list_replace_init(&engine->execlist_queue, &cancel_list);
> - spin_unlock_bh(&engine->execlist_lock);
> -
> - list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
> - list_del(&req->execlist_link);
> - i915_gem_request_put(req);
> - }
> -}
> -
> static int intel_lr_context_pin(struct i915_gem_context *ctx,
> struct intel_engine_cs *engine)
> {
> @@ -1258,7 +1222,6 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
> static int gen8_init_common_ring(struct intel_engine_cs *engine)
> {
> struct drm_i915_private *dev_priv = engine->i915;
> - unsigned int next_context_status_buffer_hw;
>
> lrc_init_hws(engine);
>
> @@ -1269,32 +1232,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> I915_WRITE(RING_MODE_GEN7(engine),
> _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
> _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
> - POSTING_READ(RING_MODE_GEN7(engine));
>
> - /*
> - * Instead of resetting the Context Status Buffer (CSB) read pointer to
> - * zero, we need to read the write pointer from hardware and use its
> - * value because "this register is power context save restored".
> - * Effectively, these states have been observed:
> - *
> - * | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
> - * BDW | CSB regs not reset | CSB regs reset |
> - * CHT | CSB regs not reset | CSB regs not reset |
> - * SKL | ? | ? |
> - * BXT | ? | ? |
> - */
> - next_context_status_buffer_hw =
> - GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));
> + I915_WRITE(RING_CONTEXT_STATUS_PTR(engine),
> + _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK |
> + GEN8_CSB_WRITE_PTR_MASK,
> + 0));
>
> - /*
> - * When the CSB registers are reset (also after power-up / gpu reset),
> - * CSB write pointer is set to all 1's, which is not valid, use '5' in
> - * this special case, so the first element read is CSB[0].
> - */
> - if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
> - next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
> -
> - engine->next_context_status_buffer = next_context_status_buffer_hw;
> DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
>
> intel_engine_init_hangcheck(engine);
> @@ -1680,10 +1623,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
> }
> intel_lr_context_unpin(dev_priv->kernel_context, engine);
>
> - engine->idle_lite_restore_wa = 0;
> - engine->disable_lite_restore_wa = false;
> - engine->ctx_desc_template = 0;
> -
> lrc_destroy_wa_ctx_obj(engine);
> engine->i915 = NULL;
> }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index a52cf57dbd40..4d70346500c2 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -97,6 +97,4 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
> int enable_execlists);
> void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
>
> -void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
> -
> #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 84aea549de5d..2181d0a41a96 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -298,11 +298,14 @@ struct intel_engine_cs {
> /* Execlists */
> struct tasklet_struct irq_tasklet;
> spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
> + struct execlist_port {
> + struct drm_i915_gem_request *request;
> + unsigned int count;
> + } execlist_port[2];
> struct list_head execlist_queue;
> unsigned int fw_domains;
> - unsigned int next_context_status_buffer;
> - unsigned int idle_lite_restore_wa;
> bool disable_lite_restore_wa;
> + bool preempt_wa;
> u32 ctx_desc_template;
>
> /**
> --
> 2.9.3
>
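To summarise the new flow for anyone else picking this up (my mental
model of the patch, as comments):

    /*
     * submit:  execlists_submit_request() appends to execlist_queue
     *          and kicks the irq_tasklet if ELSP is idle.
     * dequeue: intel_lrc_irq_handler() -> execlists_dequeue() coalesces
     *          same-context requests and fills port[0] (and port[1]).
     * retire:  a COMPLETED CSB event drops the port[0] reference and
     *          promotes port[1] to port[0].
     * reset:   i915_gem_reset_engine_cleanup() just releases both ports
     *          and reinitialises the queue.
     */
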
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx