[Intel-gfx] [PATCH 1/4] drm/i915: handle teardown of HWSP when context is freed

Fri Jan 22 06:51:50 PST 2016

On 22/01/16 10:28, Mika Kuoppala wrote:
> Dave Gordon <david.s.gordon at intel.com> writes:
>
>> Existing code did a kunmap() on the wrong page, and didn't account for the
>> fact that the HWSP and the default context are different offsets within the
>> same GEM object.
>>
>
> Just to note here that usually we try to split bug fixes early in the
> series with minimal code changes. This helps cherrypickups
> to fixes/stable. And also lessens the probability that we accidentally
> revert bugfix when some other problem occurs with the patch.

OK, let's drop this one from the series (they're orthogonal code 
changes, only related by the fact they're all to do with init/fini), and 
I'll resubmit it separately.

Please continue this series from 2/4 ...

>> This patch improves symmetry by creating lrc_teardown_hardware_status_page()
>> to complement lrc_setup_hardware_status_page(), making sure that they both
>> take the same argument (pointer to engine). They encapsulate all the details
>> of the relationship between the default context and the status page, so the
>> rest of the LRC code doesn't have to know about it.
>>
>> Signed-off-by: Dave Gordon <david.s.gordon at intel.com>
>> ---
>>   drivers/gpu/drm/i915/intel_lrc.c | 57 ++++++++++++++++++++++++++++------------
>>   1 file changed, 40 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 73d4347..3914120 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -226,9 +226,8 @@ enum {
>>   #define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
>>
>>   static int intel_lr_context_pin(struct drm_i915_gem_request *rq);
>> -static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
>> -		struct drm_i915_gem_object *default_ctx_obj);
>> -
>> +static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring);
>> +static void lrc_teardown_hardware_status_page(struct intel_engine_cs *ring);
>>
>>   /**
>>    * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
>> @@ -1536,8 +1535,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>>   	u8 next_context_status_buffer_hw;
>>
>> -	lrc_setup_hardware_status_page(ring,
>> -				dev_priv->kernel_context->engine[ring->id].state);
>> +	lrc_setup_hardware_status_page(ring);
>>
>>   	I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
>>   	I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
>> @@ -1992,10 +1990,9 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
>>   	i915_cmd_parser_fini_ring(ring);
>>   	i915_gem_batch_pool_fini(&ring->batch_pool);
>>
>> -	if (ring->status_page.obj) {
>> -		kunmap(sg_page(ring->status_page.obj->pages->sgl));
>> -		ring->status_page.obj = NULL;
>> -	}
>> +	/* Status page should have gone already */
>> +	WARN_ON(ring->status_page.page_addr);
>> +	WARN_ON(ring->status_page.obj);
>>
>>   	ring->disable_lite_restore_wa = false;
>>   	ring->ctx_desc_template = 0;
>> @@ -2434,6 +2431,11 @@ void intel_lr_context_free(struct intel_context *ctx)
>>   			continue;
>>
>>   		if (ctx == ctx->i915->kernel_context) {
>> +			/*
>> +			 * The HWSP is part of the default context
>> +			 * object in LRC mode.
>> +			 */
>> +			lrc_teardown_hardware_status_page(ringbuf->ring);
>>   			intel_unpin_ringbuffer_obj(ringbuf);
>>   			i915_gem_object_ggtt_unpin(ctx_obj);
>>   		}
>> @@ -2482,24 +2484,45 @@ uint32_t intel_lr_context_size(struct intel_engine_cs *ring)
>>   	return ret;
>>   }
>>
>> -static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
>> -		struct drm_i915_gem_object *default_ctx_obj)
>> +static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring)
>>   {
>> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> +	struct drm_i915_private *dev_priv = to_i915(ring->dev);
>> +	struct intel_context *dctx = dev_priv->kernel_context;
>> +	struct drm_i915_gem_object *dctx_obj = dctx->engine[ring->id].state;
>> +	u64 dctx_addr = i915_gem_obj_ggtt_offset(dctx_obj);
>>   	struct page *page;
>>
>> -	/* The HWSP is part of the default context object in LRC mode. */
>> -	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
>> -			+ LRC_PPHWSP_PN * PAGE_SIZE;
>> -	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
>> +	/*
>> +	 * The HWSP is part of the default context object in LRC mode.
>> +	 * Note that it doesn't contribute to the refcount!
>> +	 */
>> +	page = i915_gem_object_get_page(dctx_obj, LRC_PPHWSP_PN);
>>   	ring->status_page.page_addr = kmap(page);
>> -	ring->status_page.obj = default_ctx_obj;
>> +	ring->status_page.gfx_addr = dctx_addr + LRC_PPHWSP_PN * PAGE_SIZE;
>> +	ring->status_page.obj = dctx_obj;
>>
>>   	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
>>   			(u32)ring->status_page.gfx_addr);
>>   	POSTING_READ(RING_HWS_PGA(ring->mmio_base));
>>   }
>>
>> +/* This should be called before the default context is destroyed */
>> +static void lrc_teardown_hardware_status_page(struct intel_engine_cs *ring)
>> +{
>> +	struct drm_i915_gem_object *dctx_obj = ring->status_page.obj;
>> +	struct page *page;
>> +
>> +	WARN_ON(!dctx_obj);
>> +
>> +	if (ring->status_page.page_addr) {
>> +		page = i915_gem_object_get_page(dctx_obj, LRC_PPHWSP_PN);
>> +		kunmap(page);
>> +		ring->status_page.page_addr = NULL;
>> +	}
>> +
>
> As the page is always kmapped along with setting of status_page.obj,
> I fail to see why we need the check here for the page_addr.

The WARN_ON() tells the developer something unexpected happened.

The page_addr test avoids doing something that may cause an OOPS so that 
the driver can continue (to unload!).

IMHO, teardown code *in particular* should be written to anticipate 
being called in all sorts of inconsistent states, because a common error 
handling strategy is just to say,

     oops, something bad happened at some stage of setup!
     let's tear down everything and pass the error back

which is certainly easier than tracking exactly where the error occurred 
and undoing only the steps known to have succeeded -- that leads to lots 
of "goto err_1", "goto err_2", etc :(

This code therefore copes with the situation where the object exists, 
but the mapping hasn't been set up. With the structure of the code in 
lrc_setup_hardware_status_page() as implemented above, this won't 
happen. But someone might someday add a new early error exit that left 
these variables in that state, and the failure in setup would then 
result in the driver load failing, at which point teardown() would be 
called ...

So I might as well guard the kunmap() with a test that the page has 
actually been kmap()ed, to save some other developer from frustration in 
the future.

.Dave.

> How about just:
>
> if (WARN_ON(!dctx_obj))
>     return;
>
> -Mika
>
>> +	ring->status_page.obj = NULL;
>> +}
>> +
>>   /**
>>    * intel_lr_context_deferred_alloc() - create the LRC specific bits of a context
>>    * @ctx: LR context to create.
>> --
>> 1.9.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx