[PATCH v2 6/7] drm/amdgpu: Fix wait IOCTL lockup issues.

Christian König christian.koenig at amd.com
Fri Dec 13 10:54:35 UTC 2024


Am 12.12.24 um 15:25 schrieb Arunpravin Paneer Selvam:
> Fix the wait IOCTL lockup issue.
>
> [Dec 6 17:53] watchdog: BUG: soft lockup - CPU#4 stuck for 26s! [Xwayland:cs0:2634]
> [  +0.000002] RIP: 0010:amdgpu_userq_wait_ioctl+0x3ce/0xfe0 [amdgpu]
> [  +0.000003] RSP: 0018:ffffada901f4fc38 EFLAGS: 00000202
> [  +0.000003] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
> [  +0.000001] RDX: ffff9fc28b974048 RSI: 0000000000000010 RDI: ffffada901f4fce8
> [  +0.000002] RBP: ffffada901f4fd58 R08: ffff9fc28b974148 R09: 0000000000000000
> [  +0.000002] R10: ffffada901f4fbf0 R11: 0000000000000001 R12: ffff9fc28ed1ac00
> [  +0.000002] R13: 0000000000000000 R14: 0000000000000010 R15: ffffada901f4fe00
> [  +0.000002] FS:  00007f3a00a00640(0000) GS:ffff9fc99e800000(0000) knlGS:0000000000000000
> [  +0.000002] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  +0.000002] CR2: 00007f8e92137020 CR3: 000000013380e000 CR4: 0000000000350ef0
> [  +0.000002] DR0: ffffffff90921610 DR1: ffffffff90921611 DR2: ffffffff90921612
> [  +0.000001] DR3: ffffffff90921613 DR6: 00000000ffff0ff0 DR7: 0000000000000600
> [  +0.000002] Call Trace:
> [  +0.000002]  <IRQ>
> [  +0.000004]  ? show_regs+0x69/0x80
> [  +0.000005]  ? watchdog_timer_fn+0x271/0x300
> [  +0.000005]  ? __pfx_watchdog_timer_fn+0x10/0x10
> [  +0.000003]  ? __hrtimer_run_queues+0x114/0x2a0
> [  +0.000006]  ? hrtimer_interrupt+0x110/0x240
> [  +0.000005]  ? __sysvec_apic_timer_interrupt+0x73/0x180
> [  +0.000004]  ? sysvec_apic_timer_interrupt+0xaa/0xd0
> [  +0.000004]  </IRQ>
> [  +0.000002]  <TASK>
> [  +0.000002]  ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
> [  +0.000006]  ? amdgpu_userq_wait_ioctl+0x3ce/0xfe0 [amdgpu]
> [  +0.000162]  ? amdgpu_userq_wait_ioctl+0x3cc/0xfe0 [amdgpu]
> [  +0.000156]  ? finish_task_switch.isra.0+0x94/0x290
> [  +0.000010]  ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
> [  +0.000161]  drm_ioctl_kernel+0xaa/0x100 [drm]
> [  +0.000025]  drm_ioctl+0x29d/0x500 [drm]
> [  +0.000017]  ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
> [  +0.000160]  ? srso_return_thunk+0x5/0x5f
> [  +0.000004]  ? srso_return_thunk+0x5/0x5f
> [  +0.000003]  ? _raw_spin_unlock_irqrestore+0x27/0x50
> [  +0.000004]  amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
> [  +0.000141]  __x64_sys_ioctl+0x95/0xd0
> [  +0.000005]  x64_sys_call+0x1205/0x20d0
> [  +0.000003]  do_syscall_64+0x4d/0x120
> [  +0.000004]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [  +0.000002] RIP: 0033:0x7f3a0bb1a94f
> [  +0.000002] RSP: 002b:00007f3a009ff870 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [  +0.000003] RAX: ffffffffffffffda RBX: 00007f3a009ff9f0 RCX: 00007f3a0bb1a94f
> [  +0.000001] RDX: 00007f3a009ff9f0 RSI: 00000000c0406458 RDI: 000000000000000c
> [  +0.000002] RBP: 00007f3a009ff8f0 R08: 0000000000000001 R09: 0000000000000000
> [  +0.000002] R10: 0000000000000002 R11: 0000000000000246 R12: 0000561824bf39e0
> [  +0.000002] R13: 00000000c0406458 R14: 000000000000000c R15: 0000561824a25b60
> [  +0.000005]  </TASK>
> [ +27.998164] watchdog: BUG: soft lockup - CPU#4 stuck for 52s! [Xwayland:cs0:2634]
> [  +0.000002] RIP: 0010:drm_exec_unlock_all+0x76/0xc0 [drm_exec]
> [  +0.000002] RSP: 0018:ffffada901f4fbf8 EFLAGS: 00000246
> [  +0.000003] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
> [  +0.000002] RDX: ffff9fc28b974048 RSI: 0000000000000010 RDI: 0000000000000000
> [  +0.000001] RBP: ffffada901f4fc10 R08: ffff9fc28b974148 R09: 0000000000000000
> [  +0.000002] R10: ffffada901f4fbf0 R11: 0000000000000001 R12: ffff9fc28ed1ac00
> [  +0.000002] R13: 00000000ffffffff R14: ffffada901f4fce8 R15: ffffada901f4fe00
> [  +0.000002] FS:  00007f3a00a00640(0000) GS:ffff9fc99e800000(0000) knlGS:0000000000000000
> [  +0.000002] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  +0.000002] CR2: 00007f8e92137020 CR3: 000000013380e000 CR4: 0000000000350ef0
> [  +0.000002] DR0: ffffffff90921610 DR1: ffffffff90921611 DR2: ffffffff90921612
> [  +0.000002] DR3: ffffffff90921613 DR6: 00000000ffff0ff0 DR7: 0000000000000600
> [  +0.000002] Call Trace:
> [  +0.000002]  <IRQ>
> [  +0.000003]  ? show_regs+0x69/0x80
> [  +0.000006]  ? watchdog_timer_fn+0x271/0x300
> [  +0.000004]  ? __pfx_watchdog_timer_fn+0x10/0x10
> [  +0.000003]  ? __hrtimer_run_queues+0x114/0x2a0
> [  +0.000006]  ? hrtimer_interrupt+0x110/0x240
> [  +0.000005]  ? __sysvec_apic_timer_interrupt+0x73/0x180
> [  +0.000004]  ? sysvec_apic_timer_interrupt+0xaa/0xd0
> [  +0.000004]  </IRQ>
> [  +0.000002]  <TASK>
> [  +0.000002]  ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
> [  +0.000006]  ? drm_exec_unlock_all+0x76/0xc0 [drm_exec]
> [  +0.000004]  drm_exec_cleanup+0x77/0x90 [drm_exec]
> [  +0.000004]  amdgpu_userq_wait_ioctl+0x3cc/0xfe0 [amdgpu]
> [  +0.000206]  ? finish_task_switch.isra.0+0x94/0x290
> [  +0.000010]  ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
> [  +0.000159]  drm_ioctl_kernel+0xaa/0x100 [drm]
> [  +0.000026]  drm_ioctl+0x29d/0x500 [drm]
> [  +0.000017]  ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
> [  +0.000163]  ? srso_return_thunk+0x5/0x5f
> [  +0.000005]  ? srso_return_thunk+0x5/0x5f
> [  +0.000002]  ? _raw_spin_unlock_irqrestore+0x27/0x50
> [  +0.000005]  amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
> [  +0.000143]  __x64_sys_ioctl+0x95/0xd0
> [  +0.000005]  x64_sys_call+0x1205/0x20d0
> [  +0.000004]  do_syscall_64+0x4d/0x120
> [  +0.000003]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [  +0.000002] RIP: 0033:0x7f3a0bb1a94f
> [  +0.000002] RSP: 002b:00007f3a009ff870 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [  +0.000003] RAX: ffffffffffffffda RBX: 00007f3a009ff9f0 RCX: 00007f3a0bb1a94f
> [  +0.000002] RDX: 00007f3a009ff9f0 RSI: 00000000c0406458 RDI: 000000000000000c
> [  +0.000002] RBP: 00007f3a009ff8f0 R08: 0000000000000001 R09: 0000000000000000
> [  +0.000001] R10: 0000000000000002 R11: 0000000000000246 R12: 0000561824bf39e0
> [  +0.000002] R13: 00000000c0406458 R14: 000000000000000c R15: 0000561824a25b60
> [  +0.000006]  </TASK>
>
> Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam at amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 122 +++++++++++-------
>   1 file changed, 72 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 49dc78c2f0d7..5c39681c9720 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -383,6 +383,34 @@ static int amdgpu_userq_fence_read_wptr(struct amdgpu_usermode_queue *queue,
>   	return r;
>   }
>   
> +static int amdgpu_userq_exec_lock(struct drm_exec *exec, u32 flags,
> +				  struct drm_gem_object **gobj,
> +				  u32 num_handles, unsigned int num_fences)
> +{
> +	int r;
> +
> +	if (!exec | !gobj)
> +		return -EINVAL;
> +
> +	drm_exec_init(exec, flags, num_handles);
> +
> +	/* Lock all BOs with retry handling */
> +	drm_exec_until_all_locked(exec) {
> +		r = drm_exec_prepare_array(exec, gobj, num_handles, num_fences);
> +		drm_exec_retry_on_contention(exec);
> +		if (r)
> +			drm_exec_fini(exec);

You need a "return r" or break here.

> +	}
> +
> +	return r;
> +}
> +
> +static void amdgpu_userq_exec_unlock(struct drm_exec *exec)
> +{
> +	if (exec)
> +		drm_exec_fini(exec);
> +}
> +
>   int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>   			      struct drm_file *filp)
>   {
> @@ -476,45 +504,31 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>   	if (!queue) {
>   		r = -ENOENT;
>   		mutex_unlock(&userq_mgr->userq_mutex);
> -	}
> -
> -	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> -		      (num_read_bo_handles + num_write_bo_handles));
> -
> -	/* Lock all BOs with retry handling */
> -	drm_exec_until_all_locked(&exec) {
> -		r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1);
> -		drm_exec_retry_on_contention(&exec);
> -		if (r) {
> -			mutex_unlock(&userq_mgr->userq_mutex);
> -			goto exec_fini;
> -		}
> -
> -		r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1);
> -		drm_exec_retry_on_contention(&exec);
> -		if (r) {
> -			mutex_unlock(&userq_mgr->userq_mutex);
> -			goto exec_fini;
> -		}
> +		goto put_gobj_write;
>   	}
>   
>   	r = amdgpu_userq_fence_read_wptr(queue, &wptr);
>   	if (r) {
>   		mutex_unlock(&userq_mgr->userq_mutex);
> -		goto exec_fini;
> +		goto put_gobj_write;
>   	}
>   
>   	/* Create a new fence */
>   	r = amdgpu_userq_fence_create(queue, wptr, &fence);
>   	if (r) {
>   		mutex_unlock(&userq_mgr->userq_mutex);
> -		goto exec_fini;
> +		goto put_gobj_write;
>   	}
>   
>   	dma_fence_put(queue->last_fence);
>   	queue->last_fence = dma_fence_get(fence);
>   	mutex_unlock(&userq_mgr->userq_mutex);
>   
> +	r = amdgpu_userq_exec_lock(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> +				   gobj_read, num_read_bo_handles, 1);
> +	if (r)
> +		goto put_gobj_write;
> +
>   	for (i = 0; i < num_read_bo_handles; i++) {
>   		if (!gobj_read || !gobj_read[i]->resv)
>   			continue;
> @@ -522,6 +536,12 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>   		dma_resv_add_fence(gobj_read[i]->resv, fence,
>   				   DMA_RESV_USAGE_READ);
>   	}
> +	amdgpu_userq_exec_unlock(&exec);
> +
> +	r = amdgpu_userq_exec_lock(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> +				   gobj_write, num_write_bo_handles, 1);
> +	if (r)
> +		goto put_gobj_write;

Clear NAK, that won't work like this.

The old code was actually correct as far as I can see, e.g. you need 
something like

drm_exec_init(&exec);

drm_exec_until_all_locked(&exec) {
     r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1);
     ....
     r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1);
     ....
}

Otherwise you break the drm_exec contention handling.

Regards,
Christian.


>   
>   	for (i = 0; i < num_write_bo_handles; i++) {
>   		if (!gobj_write || !gobj_write[i]->resv)
> @@ -530,6 +550,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>   		dma_resv_add_fence(gobj_write[i]->resv, fence,
>   				   DMA_RESV_USAGE_WRITE);
>   	}
> +	amdgpu_userq_exec_unlock(&exec);
>   
>   	/* Add the created fence to syncobj/BO's */
>   	for (i = 0; i < num_syncobj_handles; i++)
> @@ -538,8 +559,6 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>   	/* drop the reference acquired in fence creation function */
>   	dma_fence_put(fence);
>   
> -exec_fini:
> -	drm_exec_fini(&exec);
>   put_gobj_write:
>   	while (wentry-- > 0)
>   		drm_gem_object_put(gobj_write[wentry]);
> @@ -650,26 +669,6 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   		}
>   	}
>   
> -	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> -		      (num_read_bo_handles + num_write_bo_handles));
> -
> -	/* Lock all BOs with retry handling */
> -	drm_exec_until_all_locked(&exec) {
> -		r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1);
> -		drm_exec_retry_on_contention(&exec);
> -		if (r) {
> -			drm_exec_fini(&exec);
> -			goto put_gobj_write;
> -		}
> -
> -		r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1);
> -		drm_exec_retry_on_contention(&exec);
> -		if (r) {
> -			drm_exec_fini(&exec);
> -			goto put_gobj_write;
> -		}
> -	}
> -
>   	if (!wait_info->num_fences) {
>   		if (num_points) {
>   			struct dma_fence_unwrap iter;
> @@ -682,7 +681,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   							   DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
>   							   &fence);
>   				if (r)
> -					goto exec_fini;
> +					goto put_gobj_write;
>   
>   				dma_fence_unwrap_for_each(f, &iter, fence)
>   					num_fences++;
> @@ -700,12 +699,17 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   						   DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
>   						   &fence);
>   			if (r)
> -				goto exec_fini;
> +				goto put_gobj_write;
>   
>   			num_fences++;
>   			dma_fence_put(fence);
>   		}
>   
> +		r = amdgpu_userq_exec_lock(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> +					   gobj_read, num_read_bo_handles, 1);
> +		if (r)
> +			goto put_gobj_write;
> +
>   		/* Count GEM objects fence */
>   		for (i = 0; i < num_read_bo_handles; i++) {
>   			struct dma_resv_iter resv_cursor;
> @@ -715,6 +719,12 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   						DMA_RESV_USAGE_READ, fence)
>   				num_fences++;
>   		}
> +		amdgpu_userq_exec_unlock(&exec);
> +
> +		r = amdgpu_userq_exec_lock(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> +					   gobj_write, num_write_bo_handles, 1);
> +		if (r)
> +			goto put_gobj_write;
>   
>   		for (i = 0; i < num_write_bo_handles; i++) {
>   			struct dma_resv_iter resv_cursor;
> @@ -724,6 +734,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   						DMA_RESV_USAGE_WRITE, fence)
>   				num_fences++;
>   		}
> +		amdgpu_userq_exec_unlock(&exec);
>   
>   		/*
>   		 * Passing num_fences = 0 means that userspace doesn't want to
> @@ -737,7 +748,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   		fence_info = kmalloc_array(wait_info->num_fences, sizeof(*fence_info), GFP_KERNEL);
>   		if (!fence_info) {
>   			r = -ENOMEM;
> -			goto exec_fini;
> +			goto put_gobj_write;
>   		}
>   
>   		/* Array of fences */
> @@ -747,6 +758,11 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   			goto free_fence_info;
>   		}
>   
> +		r = amdgpu_userq_exec_lock(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> +					   gobj_read, num_read_bo_handles, 1);
> +		if (r)
> +			goto free_fences;
> +
>   		/* Retrieve GEM read objects fence */
>   		for (i = 0; i < num_read_bo_handles; i++) {
>   			struct dma_resv_iter resv_cursor;
> @@ -756,6 +772,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   						DMA_RESV_USAGE_READ, fence) {
>   				if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) {
>   					r = -EINVAL;
> +					amdgpu_userq_exec_unlock(&exec);
>   					goto free_fences;
>   				}
>   
> @@ -763,6 +780,12 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   				dma_fence_get(fence);
>   			}
>   		}
> +		amdgpu_userq_exec_unlock(&exec);
> +
> +		r = amdgpu_userq_exec_lock(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> +					   gobj_write, num_write_bo_handles, 1);
> +		if (r)
> +			goto free_fences;
>   
>   		/* Retrieve GEM write objects fence */
>   		for (i = 0; i < num_write_bo_handles; i++) {
> @@ -773,6 +796,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   						DMA_RESV_USAGE_WRITE, fence) {
>   				if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) {
>   					r = -EINVAL;
> +					amdgpu_userq_exec_unlock(&exec);
>   					goto free_fences;
>   				}
>   
> @@ -780,6 +804,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   				dma_fence_get(fence);
>   			}
>   		}
> +		amdgpu_userq_exec_unlock(&exec);
>   
>   		if (num_points) {
>   			struct dma_fence_unwrap iter;
> @@ -885,7 +910,6 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   		kfree(fence_info);
>   	}
>   
> -	drm_exec_fini(&exec);
>   	for (i = 0; i < num_read_bo_handles; i++)
>   		drm_gem_object_put(gobj_read[i]);
>   	kfree(gobj_read);
> @@ -908,8 +932,6 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
>   	kfree(fences);
>   free_fence_info:
>   	kfree(fence_info);
> -exec_fini:
> -	drm_exec_fini(&exec);
>   put_gobj_write:
>   	while (wentry-- > 0)
>   		drm_gem_object_put(gobj_write[wentry]);



More information about the amd-gfx mailing list