[Intel-gfx] [PATCH 4/5] drm/i915: Track page table reload need

Thu Mar 19 04:26:13 PDT 2015

On 3/19/2015 9:01 AM, Mika Kuoppala wrote:
> Michel Thierry <michel.thierry at intel.com> writes:
>
>> From: Ben Widawsky <benjamin.widawsky at intel.com>
>>
>> This patch was formerly known as, "Force pd restore when PDEs change,
>> gen6-7." I had to change the name because it is needed for GEN8 too.
>>
>> The real issue this is trying to solve is when a new object is mapped
>> into the current address space. The GPU does not snoop the new mapping
>> so we must do the gen specific action to reload the page tables.
>>
>> GEN8 and GEN7 do differ in the way they load page tables for the RCS.
>> GEN8 does so with the context restore, while GEN7 requires the proper
>> load commands in the command streamer. Non-render is similar for both.
>>
>> Caveat for GEN7
>> The docs say you cannot change the PDEs of a currently running context.
>> We never map new PDEs of a running context, and expect them to be
>> present - so I think this is okay. (We can unmap, but this should also
>> be okay since we only unmap unreferenced objects that the GPU shouldn't
>> be tryingto va->pa xlate.) The MI_SET_CONTEXT command does have a flag
>> to signal that even if the context is the same, force a reload. It's
>> unclear exactly what this does, but I have a hunch it's the right thing
>> to do.
>>
>> The logic assumes that we always emit a context switch after mapping new
>> PDEs, and before we submit a batch. This is the case today, and has been
>> the case since the inception of hardware contexts. A note in the comment
>> let's the user know.
>>
>> It's not just for gen8. If the current context has mappings change, we
>> need a context reload to switch
>>
>> v2: Rebased after ppgtt clean up patches. Split the warning for aliasing
>> and true ppgtt options. And do not break aliasing ppgtt, where to->ppgtt
>> is always null.
>>
>> v3: Invalidate PPGTT TLBs inside alloc_va_range.
>>
>> v4: Rename ppgtt_invalidate_tlbs to mark_tlbs_dirty and move
>> pd_dirty_rings from i915_address_space to i915_hw_ppgtt. Fixes when
>> neither ctx->ppgtt and aliasing_ppgtt exist.
>>
>> v5: Removed references to teardown_va_range.
>>
>> v6: Updated needs_pd_load_pre/post.
>>
>> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
>> Signed-off-by: Michel Thierry <michel.thierry at intel.com> (v2+)
>> ---
>>   drivers/gpu/drm/i915/i915_gem_context.c    | 26 ++++++++++++++++++++------
>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
>>   drivers/gpu/drm/i915/i915_gem_gtt.c        | 11 +++++++++++
>>   drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
>>   4 files changed, 43 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index b6ea85d..9197ff4 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -573,8 +573,20 @@ static inline bool should_skip_switch(struct intel_engine_cs *ring,
>>   				      struct intel_context *from,
>>   				      struct intel_context *to)
>>   {
>> -	if (from == to && !to->remap_slice)
>> -		return true;
>> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> +
>> +	if (to->remap_slice)
>> +		return false;
>> +
>> +	if (to->ppgtt) {
>> +		if (from == to && !test_bit(ring->id,
>> +				&to->ppgtt->pd_dirty_rings))
>> +			return true;
>> +	} else if (dev_priv->mm.aliasing_ppgtt) {
>> +		if (from == to && !test_bit(ring->id,
>> +				&dev_priv->mm.aliasing_ppgtt->pd_dirty_rings))
>> +			return true;
>> +	}
>>   
>>   	return false;
>>   }
>> @@ -610,10 +622,7 @@ needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
>>   	if (ring != &dev_priv->ring[RCS])
>>   		return false;
>>   
>> -	if (!to->legacy_hw_ctx.initialized)
>> -		return true;
>> -
>> -	if (i915_gem_context_is_default(to))
>> +	if (&to->ppgtt->pd_dirty_rings)
> if (to->ppgtt->pd_dirty_rings) ?
Sorry, wrong copy/paste from test/clear_bit functions.

>>   		return true;
>>   
>>   	return false;
>> @@ -664,6 +673,9 @@ static int do_switch(struct intel_engine_cs *ring,
>>   		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
>>   		if (ret)
>>   			goto unpin_out;
>> +
>> +		/* Doing a PD load always reloads the page dirs */
>> +		clear_bit(ring->id, &to->ppgtt->pd_dirty_rings);
>>   	}
>>   
>>   	if (ring != &dev_priv->ring[RCS]) {
>> @@ -696,6 +708,8 @@ static int do_switch(struct intel_engine_cs *ring,
>>   
>>   	if (!to->legacy_hw_ctx.initialized || i915_gem_context_is_default(to))
>>   		hw_flags |= MI_RESTORE_INHIBIT;
>> +	else if (to->ppgtt && test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
>> +		hw_flags |= MI_FORCE_RESTORE;
>>   
>>   	ret = mi_set_context(ring, to, hw_flags);
>>   	if (ret)
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index dc10bc4..fd0241a 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -1187,6 +1187,13 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
>>   	if (ret)
>>   		goto error;
>>   
>> +	if (ctx->ppgtt)
>> +		WARN(ctx->ppgtt->pd_dirty_rings & (1<<ring->id),
>> +			"%s didn't clear reload\n", ring->name);
>> +	else if (dev_priv->mm.aliasing_ppgtt)
>> +		WARN(dev_priv->mm.aliasing_ppgtt->pd_dirty_rings &
>> +			(1<<ring->id), "%s didn't clear reload\n", ring->name);
>> +
>>   	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>>   	instp_mask = I915_EXEC_CONSTANTS_MASK;
>>   	switch (instp_mode) {
>> @@ -1456,6 +1463,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>   	if (ret)
>>   		goto err;
>>   
>> +	/* XXX: Reserve has possibly change PDEs which means we must do a
>> +	 * context switch before we can coherently read some of the reserved
>> +	 * VMAs. */
>> +
> With ggtt we should be always be safe as the ptes are snooped. And we
> don't do dynamic allocation on the ggtt anyways. So why we need the
> ctx switch?
>
> As I fail to connect the dots, could you please elaborate the comment
> section.
Right, this comment is only applicable to full PPGTT, which won't happen 
here:
i915_gem_execbuffer_reserve()
->i915_gem_execbuffer_reserve_vma()
-->i915_gem_object_pin() (vma == NULL || !drm_mm_node_allocated(&vma->node))
--->i915_gem_object_bind_to_vm (allocate before insert / bind)
---->gen6_alloc_va_range

>
> --Mika
>
>>   	/* The objects are in their final locations, apply the relocations. */
>>   	if (need_relocs)
>>   		ret = i915_gem_execbuffer_relocate(eb);
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> index c50b4ab..1758fd9 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> @@ -1161,6 +1161,16 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
>>   			       4096, PCI_DMA_BIDIRECTIONAL);
>>   }
>>   
>> +/* PDE TLBs are a pain invalidate pre GEN8. It requires a context reload. If we
>> + * are switching between contexts with the same LRCA, we also must do a force
>> + * restore.
>> + */
>> +static inline void mark_tlbs_dirty(struct i915_hw_ppgtt *ppgtt)
>> +{
>> +	/* If current vm != vm, */
>> +	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;
>> +}
>> +
>>   static int gen6_alloc_va_range(struct i915_address_space *vm,
>>   			       uint64_t start, uint64_t length)
>>   {
>> @@ -1180,6 +1190,7 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
>>   				GEN6_PTES);
>>   	}
>>   
>> +	mark_tlbs_dirty(ppgtt);
>>   	return 0;
>>   }
>>   
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> index c30af62..074b1fc 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> @@ -300,6 +300,7 @@ struct i915_hw_ppgtt {
>>   	struct i915_address_space base;
>>   	struct kref ref;
>>   	struct drm_mm_node node;
>> +	unsigned long pd_dirty_rings;
>>   	unsigned num_pd_entries;
>>   	unsigned num_pd_pages; /* gen8+ */
>>   	union {
>> -- 
>> 2.1.1