[Intel-gfx] [RFC 2/2] drm/i915/dgfx: Runtime resume the dgpu on user fault

Wed Aug 17 18:11:06 UTC 2022

On 17/08/2022 16:09, Anshuman Gupta wrote:
> Runtime resume the dgpu(when gem object lies in lmem).
> This will transition the dgpu graphics function to D0
> state if it was in D3 in order to access the mmap memory
> mappings.
> 
> Cc: Matthew Auld <matthew.auld at intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Signed-off-by: Anshuman Gupta <anshuman.gupta at intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 25 +++++++++++++++++++------
>   1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index b49823d599e7..1e9b07473a8f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1020,6 +1020,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>   	struct ttm_buffer_object *bo = area->vm_private_data;
>   	struct drm_device *dev = bo->base.dev;
>   	struct drm_i915_gem_object *obj;
> +	intel_wakeref_t wakeref = 0;
>   	vm_fault_t ret;
>   	int idx;
>   
> @@ -1027,18 +1028,24 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>   	if (!obj)
>   		return VM_FAULT_SIGBUS;
>   
> +	if (i915_gem_object_is_lmem(obj))

We shouldn't call this without first locking the object (see 
bo_vm_reserve below), since it could be in the process of being moved to 
system memory or vice versa. For example, below we check is_lmem() again 
(this time with the lock held), which might return true, even though 
here it returned false, which means we can now race against the 
i915_gem_runtime_suspend() modifying the list as we add something.

We ofc still need to audit all the kernel internal users that are 
touching lmem though a CPU mapping, and making sure we have the right 
pm_get/put wrapping those accesses.

> +		wakeref = intel_runtime_pm_get(&to_i915(obj->base.dev)->runtime_pm);
> +
>   	/* Sanity check that we allow writing into this object */
>   	if (unlikely(i915_gem_object_is_readonly(obj) &&
> -		     area->vm_flags & VM_WRITE))
> -		return VM_FAULT_SIGBUS;
> +		     area->vm_flags & VM_WRITE)) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto out_rpm;
> +	}
>   
>   	ret = ttm_bo_vm_reserve(bo, vmf);
>   	if (ret)
> -		return ret;
> +		goto out_rpm;
>   
>   	if (obj->mm.madv != I915_MADV_WILLNEED) {
>   		dma_resv_unlock(bo->base.resv);
> -		return VM_FAULT_SIGBUS;
> +		ret = VM_FAULT_SIGBUS;
> +		goto out_rpm;
>   	}
>   
>   	if (!i915_ttm_resource_mappable(bo->resource)) {
> @@ -1062,7 +1069,8 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>   		if (err) {
>   			drm_dbg(dev, "Unable to make resource CPU accessible\n");
>   			dma_resv_unlock(bo->base.resv);
> -			return VM_FAULT_SIGBUS;
> +			ret = VM_FAULT_SIGBUS;
> +			goto out_rpm;
>   		}
>   	}
>   
> @@ -1078,11 +1086,16 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>   		list_add(&obj->userfault_link, &to_gt(to_i915(obj->base.dev))->lmem_userfault_list);
>   
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> -		return ret;
> +		goto out_rpm;
>   
>   	i915_ttm_adjust_lru(obj);
>   
>   	dma_resv_unlock(bo->base.resv);
> +
> +out_rpm:
> +	if (wakeref)
> +		intel_runtime_pm_put(&to_i915(obj->base.dev)->runtime_pm, wakeref);

Do we need something like DRM_I915_USERFAULT_AUTOSUSPEND here?

> +
>   	return ret;
>   }
>