[Intel-xe] [PATCH] RFC drm/xe: Add mem_access_get in gem_create_ioctl

Ghimiray, Himal Prasad himal.prasad.ghimiray at intel.com
Mon Oct 9 08:38:10 UTC 2023


Hi Riana,


On 09-10-2023 11:52, Riana Tauro wrote:
> gem_create_ioctl does not take a mem_access reference until it
> reaches xe_bo_move.
>
> When the device is runtime suspended (in D3cold), a new bo created
> as part of gem_create_ioctl can steal the buddy block of kernel
> objects that have not yet been restored by runtime resume. Runtime
> resume is triggered only from xe_bo_move, so while restoring the
> kernel objects the resume path finds that its buddy block is not
> free and tries to evict the new bo, which is already locked,
> causing a deadlock.
>
> Prevent the deadlock by taking a mem_access reference early in the
> ioctl.
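
If I read the traces below correctly, the cycle is: the pm_runtime
worker, restoring kernel objects during resume, waits on the new bo's
reservation, while the ioctl thread holds that reservation and waits
for the resume to complete. So the fix boils down to this shape
(simplified sketch only; gem_create_body() is a hypothetical stand-in
for the existing ioctl body, not a real helper):

	int xe_gem_create_ioctl(struct drm_device *dev, void *data,
				struct drm_file *file)
	{
		struct xe_device *xe = to_xe_device(dev);
		int err;

		/*
		 * Wake the device (and let it finish restoring kernel
		 * bos) before any VRAM can be handed out to the new
		 * bo, so resume can never contend with this ioctl's
		 * allocation.
		 */
		xe_device_mem_access_get(xe);

		err = gem_create_body(xe, data, file);

		xe_device_mem_access_put(xe);
		return err;
	}
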
>
> 	INFO: task kworker/1:1:44 blocked for more than 61 seconds.
> 	"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> 	task:kworker/1:1     state:D stack:25272 pid:44 ppid:2  flags:0x00004000
> 	[  +0.008395] Workqueue: pm pm_runtime_work
> 	[  +0.004068] Call Trace:
> 	[  +0.002486]  <TASK>
> 	[  +0.002161]  __schedule+0x6f5/0x1640
> 	[  +0.003702]  ? __pfx___schedule+0x10/0x10
> 	[  +0.004133]  ? __ww_mutex_lock.constprop.0+0xf4f/0x1e60
> 	[  +0.005330]  schedule+0x92/0x120
> 	....
> 	[  +0.003922]  ttm_bo_mem_space+0x46d/0x490 [ttm]
> 	[  +0.004586]  xe_bo_restore_pinned+0x200/0x320 [xe]
> 	[  +0.005007]  ? __pfx_xe_bo_restore_pinned+0x10/0x10 [xe]
> 	[  +0.005503]  ? __pfx__printk+0x10/0x10
> 	[  +0.003791]  ? __pfx_do_raw_spin_lock+0x10/0x10
> 	[  +0.004597]  xe_bo_restore_kernel+0x2e4/0x470 [xe]
> 	[  +0.005521]  xe_pm_runtime_resume+0x20a/0x750 [xe]
> 	....
> 	INFO: task xe_mmap:1836 blocked for more than 61 seconds.
> 	"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> 	task:xe_mmap   state:D stack:23600 pid:1836  ppid:1831 flags:0x00004002
> 	[  +0.008395] Call Trace:
> 	[  +0.002486]  <TASK>
> 	[  +0.003271]  rpm_resume+0x341/0xad0
> 	[  +0.005269]  __pm_runtime_resume+0x53/0xc0
> 	[  +0.004152]  xe_device_mem_access_get+0x2b/0x60 [xe]
> 	[  +0.005172]  xe_bo_move+0x2ef/0x9f0 [xe]
> 	[  +0.004131]  ttm_bo_handle_move_mem+0x15a/0x230 [ttm]
>
> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/256
>
> Cc: Matthew Auld <matthew.auld at intel.com>
> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_bo.c | 22 +++++++++++++++++-----
>   1 file changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 61789c0e88fb..e453a5264c82 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -630,6 +630,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>   	bool tt_has_data;
>   	bool needs_clear;
>   	int ret = 0;
> +	bool device_awake;
>   
>   	/* Bo creation path, moving to system or TT. No clearing required. */
>   	if (!old_mem && ttm) {
> @@ -712,7 +713,8 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>   	xe_tile_assert(tile, tile->migrate);
>   
>   	trace_xe_bo_move(bo);
> -	xe_device_mem_access_get(xe);
> +
> +	device_awake = xe_device_mem_access_get_if_ongoing(xe);

IIRC xe_bo_move is called in the eviction path too. Wouldn't it be
safer to use xe_device_mem_access_get here as well?
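
For context, my mental model of the two helpers (paraphrased from
memory, not the exact implementation):

	/* Takes a reference and synchronously resumes the device if
	 * it is runtime suspended; must not be called from the resume
	 * path itself. */
	void xe_device_mem_access_get(struct xe_device *xe);

	/* Takes a reference only if the device is already awake or an
	 * access is already ongoing; returns false otherwise and never
	 * triggers a resume. */
	bool xe_device_mem_access_get_if_ongoing(struct xe_device *xe);

On a pure eviction there may be no caller already holding a
reference, so get_if_ongoing would return false and we would move the
bo without any guarantee that the device stays awake.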

BR

Himal

>   
>   	if (xe_bo_is_pinned(bo) && !xe_bo_is_user(bo)) {
>   		/*
> @@ -735,7 +737,8 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>   
>   				if (XE_WARN_ON(new_mem->start == XE_BO_INVALID_OFFSET)) {
>   					ret = -EINVAL;
> -					xe_device_mem_access_put(xe);
> +					if (device_awake)
> +						xe_device_mem_access_put(xe);
>   					goto out;
>   				}
>   
> @@ -753,7 +756,8 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>   						bo, bo, old_mem, new_mem);
>   		if (IS_ERR(fence)) {
>   			ret = PTR_ERR(fence);
> -			xe_device_mem_access_put(xe);
> +			if (device_awake)
> +				xe_device_mem_access_put(xe);
>   			goto out;
>   		}
>   		if (!move_lacks_source) {
> @@ -778,10 +782,12 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>   		dma_fence_put(fence);
>   	}
>   
> -	xe_device_mem_access_put(xe);
> +	if (device_awake)
> +		xe_device_mem_access_put(xe);
>   	trace_printk("new_mem->mem_type=%d\n", new_mem->mem_type);
>   
>   out:
> +
>   	return ret;
>   
>   }
> @@ -1810,13 +1816,17 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
>   		bo_flags |= XE_BO_NEEDS_CPU_ACCESS;
>   	}
>   
> +	xe_device_mem_access_get(xe);
>   	if (args->vm_id) {
>   		vm = xe_vm_lookup(xef, args->vm_id);
> -		if (XE_IOCTL_DBG(xe, !vm))
> +		if (XE_IOCTL_DBG(xe, !vm)) {
> +			xe_device_mem_access_put(xe);
>   			return -ENOENT;
> +		}
>   		err = xe_vm_lock(vm, true);
>   		if (err) {
>   			xe_vm_put(vm);
> +			xe_device_mem_access_put(xe);
>   			return err;
>   		}
>   	}
> @@ -1845,6 +1855,8 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
>   		xe_vm_unlock(vm);
>   		xe_vm_put(vm);
>   	}
> +
> +	xe_device_mem_access_put(xe);
>   	return err;
>   }
>   