[PATCH v3 4/6] drm/i915/ttm: Break refcounting loops at device region unref time

Matthew Auld matthew.auld at intel.com
Mon Nov 15 10:49:32 UTC 2021


On 14/11/2021 11:12, Thomas Hellström wrote:
> There is an interesting refcounting loop:
> struct intel_memory_region has a struct ttm_resource_manager,
> ttm_resource_manager->move may hold a reference to i915_request,
> i915_request may hold a reference to intel_context,
> intel_context may hold a reference to drm_i915_gem_object,
> drm_i915_gem_object may hold a reference to intel_memory_region.

Would it help if we drop the per object region refcoutning? IIRC that 
was originally added to more cleanly appease some selftest teardown or 
something.

> 
> Break this loop when we drop the device reference count on the
> region by putting the region move fence.
> 
> Also hold dropping the device reference count until all objects of
> the region has been deleted, to avoid issues if proceeding with the
> device takedown while the region is still present.
> 
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c     |  1 +
>   drivers/gpu/drm/i915/gt/intel_region_lmem.c |  1 +
>   drivers/gpu/drm/i915/intel_memory_region.c  |  5 +++-
>   drivers/gpu/drm/i915/intel_memory_region.h  |  1 +
>   drivers/gpu/drm/i915/intel_region_ttm.c     | 28 +++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_region_ttm.h     |  2 ++
>   6 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 537a81445b90..a1df49378a0f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1044,6 +1044,7 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>   
>   static const struct intel_memory_region_ops ttm_system_region_ops = {
>   	.init_object = __i915_gem_ttm_object_init,
> +	.disable = intel_region_ttm_disable,
>   };
>   
>   struct intel_memory_region *
> diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
> index aec838ecb2ef..956916fd21f8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
> +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
> @@ -108,6 +108,7 @@ region_lmem_init(struct intel_memory_region *mem)
>   static const struct intel_memory_region_ops intel_region_lmem_ops = {
>   	.init = region_lmem_init,
>   	.release = region_lmem_release,
> +	.disable = intel_region_ttm_disable,
>   	.init_object = __i915_gem_ttm_object_init,
>   };
>   
> diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c
> index e7f7e6627750..1f67d2b68c24 100644
> --- a/drivers/gpu/drm/i915/intel_memory_region.c
> +++ b/drivers/gpu/drm/i915/intel_memory_region.c
> @@ -233,8 +233,11 @@ void intel_memory_regions_driver_release(struct drm_i915_private *i915)
>   		struct intel_memory_region *region =
>   			fetch_and_zero(&i915->mm.regions[i]);
>   
> -		if (region)
> +		if (region) {
> +			if (region->ops->disable)
> +				region->ops->disable(region);
>   			intel_memory_region_put(region);
> +		}
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/i915/intel_memory_region.h b/drivers/gpu/drm/i915/intel_memory_region.h
> index 3feae3353d33..9bb77eacd206 100644
> --- a/drivers/gpu/drm/i915/intel_memory_region.h
> +++ b/drivers/gpu/drm/i915/intel_memory_region.h
> @@ -52,6 +52,7 @@ struct intel_memory_region_ops {
>   
>   	int (*init)(struct intel_memory_region *mem);
>   	void (*release)(struct intel_memory_region *mem);
> +	void (*disable)(struct intel_memory_region *mem);
>   
>   	int (*init_object)(struct intel_memory_region *mem,
>   			   struct drm_i915_gem_object *obj,
> diff --git a/drivers/gpu/drm/i915/intel_region_ttm.c b/drivers/gpu/drm/i915/intel_region_ttm.c
> index 2e901a27e259..4219d83a2b19 100644
> --- a/drivers/gpu/drm/i915/intel_region_ttm.c
> +++ b/drivers/gpu/drm/i915/intel_region_ttm.c
> @@ -114,6 +114,34 @@ void intel_region_ttm_fini(struct intel_memory_region *mem)
>   	mem->region_private = NULL;
>   }
>   
> +/**
> + * intel_region_ttm_disable - A TTM region disable callback helper
> + * @mem: The memory region.
> + *
> + * A helper that ensures that nothing any longer references a region at
> + * device takedown. Breaks refcounting loops and waits for objects in the
> + * region to be deleted.
> + */
> +void intel_region_ttm_disable(struct intel_memory_region *mem)
> +{
> +	struct ttm_resource_manager *man = mem->region_private;
> +
> +	/*
> +	 * Put the region's move fences. This releases requests that
> +	 * may hold on to contexts and vms that may hold on to buffer
> +	 * objects that may have a refcount on the region. :/
> +	 */
> +	if (man)
> +		ttm_resource_manager_cleanup(man);
> +
> +	/* Flush objects that may just have been freed */
> +	i915_gem_flush_free_objects(mem->i915);
> +
> +	/* Wait until the only region reference left is our own. */
> +	while (kref_read(&mem->kref) > 1)
> +		msleep(20);

If we leak an object, I guess we get an infinite loop here at driver 
release?

> +}
> +
>   /**
>    * intel_region_ttm_resource_to_rsgt -
>    * Convert an opaque TTM resource manager resource to a refcounted sg_table.
> diff --git a/drivers/gpu/drm/i915/intel_region_ttm.h b/drivers/gpu/drm/i915/intel_region_ttm.h
> index 7bbe2b46b504..197a8c179370 100644
> --- a/drivers/gpu/drm/i915/intel_region_ttm.h
> +++ b/drivers/gpu/drm/i915/intel_region_ttm.h
> @@ -22,6 +22,8 @@ int intel_region_ttm_init(struct intel_memory_region *mem);
>   
>   void intel_region_ttm_fini(struct intel_memory_region *mem);
>   
> +void intel_region_ttm_disable(struct intel_memory_region *mem);
> +
>   struct i915_refct_sgt *
>   intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem,
>   				  struct ttm_resource *res);
> 


More information about the dri-devel mailing list