[PATCH 07/12] drm/amdgpu/sriov:implement strict gpu reset

Liu, Monk Monk.Liu at amd.com
Mon Oct 9 08:30:10 UTC 2017


Why shouldn't we return an error code? Any particular reason?

If we wake up a thread waiting on some fence and return 0, the UMD won't be aware that a GPU hang just occurred ...

For this strict mode reset the policy is aligned with the Vulkan spec:
it defines that the VK runtime should return a VK_ERROR_DEVICE_LOST error to the app from those waiting functions, and the kernel should make the VK UMD aware of it, so the kernel must return an error to the UMD in the cs_wait IOCTL.

The UMD won't invoke amdgpu_ctx_query for no reason; it has to be notified first through an error on some waiting function like cs_wait.
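
To make the intended flow concrete, the UMD side would look roughly like this (just an illustrative sketch, not real UMD code -- the wrapper name and error handling are made up, while drmCommandWriteRead(), DRM_AMDGPU_WAIT_CS and union drm_amdgpu_wait_cs are the real libdrm/uapi pieces):

    #include <vulkan/vulkan.h>
    #include <xf86drm.h>              /* drmCommandWriteRead() */
    #include <libdrm/amdgpu_drm.h>    /* DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs */

    /* illustrative sketch: translate the -ENODEV from the cs_wait IOCTL
     * into VK_ERROR_DEVICE_LOST for the application */
    static VkResult example_wait_cs(int drm_fd, union drm_amdgpu_wait_cs *args)
    {
            int r = drmCommandWriteRead(drm_fd, DRM_AMDGPU_WAIT_CS,
                                        args, sizeof(*args));

            if (r == -ENODEV)
                    return VK_ERROR_DEVICE_LOST;  /* gpu reset signaled the fence with an error */
            if (r < 0)
                    return VK_ERROR_DEVICE_LOST;  /* simplified: treat other ioctl failures the same */

            /* out.status != 0 means the fence did not signal within the timeout */
            return args->out.status ? VK_TIMEOUT : VK_SUCCESS;
    }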

BR Monk

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com] 
Sent: October 9, 2017 16:20
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH 07/12] drm/amdgpu/sriov:implement strict gpu reset

Am 30.09.2017 um 08:03 schrieb Monk Liu:
> changes:
> 1) implement strict mode sriov gpu reset
> 2) always call sriov_gpu_reset_strict if the hypervisor notifies an FLR
> 3) in strict reset mode, set an error on all fences
> 4) change the fence_wait/cs_wait functions to return -ENODEV if the
>    fence signaled with error == -ETIME
>
> Since after a strict gpu reset we consider the VRAM to be lost, and
> under that assumption there is little point in recovering the shadow
> BOs, because all textures/resources/shaders cannot be recovered anyway
> (if they resided in VRAM).

NAK, we shouldn't return an error code from the wait functions; instead, handle the fence error code in amdgpu_ctx_query().
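
The idea would be roughly the following (a rough sketch only, not the actual amdgpu_ctx_query() implementation -- the ctx->fence_error field is assumed here as a sticky error latched when one of the context's fences signals with an error; the reset_status values are the real ones from amdgpu_drm.h):

    #include <drm/amdgpu_drm.h>   /* AMDGPU_CTX_*_RESET, union drm_amdgpu_ctx_out */
    #include "amdgpu.h"           /* struct amdgpu_ctx */

    /* rough sketch: report a latched fence error through the existing
     * context state query instead of failing the wait IOCTLs */
    static int amdgpu_ctx_query_sketch(struct amdgpu_ctx *ctx,
                                       union drm_amdgpu_ctx_out *out)
    {
            memset(out, 0, sizeof(*out));

            if (ctx->fence_error)   /* assumed field, e.g. a latched -ETIME */
                    /* could be refined to GUILTY/INNOCENT per context */
                    out->state.reset_status = AMDGPU_CTX_UNKNOWN_RESET;
            else
                    out->state.reset_status = AMDGPU_CTX_NO_RESET;

            return 0;
    }

That way the wait functions keep their normal semantics and the device loss is still delivered to the UMD, just through the query interface.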

Regards,
Christian.

>
> Change-Id: I50d9b8b5185ba92f137f07c9deeac19d740d753b
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 25 ++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 90 +++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  4 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  4 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c         |  4 +-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 60 ++++++++++++++++++
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +
>   10 files changed, 188 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index de11527..de9c164 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -123,6 +123,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
>   extern int amdgpu_param_buf_per_se;
>   extern int amdgpu_job_hang_limit;
>   extern int amdgpu_lbpw;
> +extern int amdgpu_sriov_reset_level;
>   
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   extern int amdgpu_si_support;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index c6a214f..9467cf6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1262,6 +1262,7 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
>   	struct amdgpu_ctx *ctx;
>   	struct dma_fence *fence;
>   	long r;
> +	int fence_err = 0;
>   
>   	if (amdgpu_kms_vram_lost(adev, fpriv))
>   		return -ENODEV;
> @@ -1283,6 +1284,8 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
>   		r = PTR_ERR(fence);
>   	else if (fence) {
>   		r = dma_fence_wait_timeout(fence, true, timeout);
> +		/* if fence_err < 0, the gpu hung and this fence was signaled by the gpu reset */
> +		fence_err = dma_fence_get_status(fence);
>   		dma_fence_put(fence);
>   	} else
>   		r = 1;
> @@ -1292,7 +1295,10 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
>   		return r;
>   
>   	memset(wait, 0, sizeof(*wait));
> -	wait->out.status = (r == 0);
> +	wait->out.status = (fence_err < 0);
> +
> +	if (fence_err < 0)
> +		return -ENODEV;
>   
>   	return 0;
>   }
> @@ -1346,6 +1352,7 @@ static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev,
>   	uint32_t fence_count = wait->in.fence_count;
>   	unsigned int i;
>   	long r = 1;
> +	int fence_err = 0;
>   
>   	for (i = 0; i < fence_count; i++) {
>   		struct dma_fence *fence;
> @@ -1358,16 +1365,20 @@ static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev,
>   			continue;
>   
>   		r = dma_fence_wait_timeout(fence, true, timeout);
> +		fence_err = dma_fence_get_status(fence);
>   		dma_fence_put(fence);
>   		if (r < 0)
>   			return r;
>   
> -		if (r == 0)
> +		if (r == 0 || fence_err < 0)
>   			break;
>   	}
>   
>   	memset(wait, 0, sizeof(*wait));
> -	wait->out.status = (r > 0);
> +	wait->out.status = (r > 0 && fence_err == 0);
> +
> +	if (fence_err < 0)
> +		return -ENODEV;
>   
>   	return 0;
>   }
> @@ -1391,6 +1402,7 @@ static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev,
>   	struct dma_fence **array;
>   	unsigned int i;
>   	long r;
> +	int fence_err = 0;
>   
>   	/* Prepare the fence array */
>   	array = kcalloc(fence_count, sizeof(struct dma_fence *), GFP_KERNEL);
> @@ -1418,10 +1430,12 @@ static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev,
>   				       &first);
>   	if (r < 0)
>   		goto err_free_fence_array;
> +	else
> +		fence_err = dma_fence_get_status(array[first]);
>   
>   out:
>   	memset(wait, 0, sizeof(*wait));
> -	wait->out.status = (r > 0);
> +	wait->out.status = (r > 0 && fence_err == 0);
>   	wait->out.first_signaled = first;
>   	/* set return value 0 to indicate success */
>   	r = 0;
> @@ -1431,6 +1445,9 @@ static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev,
>   		dma_fence_put(array[i]);
>   	kfree(array);
>   
> +	if (fence_err < 0)
> +		return -ENODEV;
> +
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9efbb33..122e2e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2734,6 +2734,96 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>   }
>   
>   /**
> + * amdgpu_sriov_gpu_reset_strict - reset the asic under strict mode
> + *
> + * @adev: amdgpu device pointer
> + * @job: which job trigger hang
> + *
> + * Attempt to reset the GPU if it has hung (all ASICs), for the SRIOV case.
> + * Returns 0 for success or an error on failure.
> + *
> + * This function will deny all processes/fences created before this
> + * reset and drop all jobs unfinished at the time of the reset.
> + *
> + * The application takes the responsibility to re-open the FD in order
> + * to re-create the VM page table and recover all resources as well.
> + *
> + **/
> +int amdgpu_sriov_gpu_reset_strict(struct amdgpu_device *adev, struct amdgpu_job *job)
> +{
> +	int i, r = 0;
> +	int resched;
> +	struct amdgpu_ring *ring;
> +
> +	/* another thread is already doing the gpu reset, so just quit and retry later */
> +	if (!atomic_add_unless(&adev->in_sriov_reset, 1, 1))
> +		return -EAGAIN;
> +
> +	atomic_inc(&adev->gpu_reset_counter);
> +
> +	/* block TTM */
> +	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> +
> +	/* fake-signal the jobs that were already scheduled */
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		kthread_park(ring->sched.thread);
> +		amd_sched_set_sched_hang(&ring->sched);
> +		amdgpu_fence_driver_force_completion_ring(ring);
> +		amd_sched_set_queue_hang(&ring->sched);
> +	}
> +
> +	/* request to take full control of GPU before re-initialization  */
> +	if (job)
> +		amdgpu_virt_reset_gpu(adev);
> +	else
> +		amdgpu_virt_request_full_gpu(adev, true);
> +
> +	/* Resume IP prior to SMC */
> +	amdgpu_sriov_reinit_early(adev);
> +
> +	/* we need to recover the GART prior to resuming SMC/CP/SDMA */
> +	amdgpu_ttm_recover_gart(adev);
> +
> +	/* now we are okay to resume SMC/CP/SDMA */
> +	amdgpu_sriov_reinit_late(adev);
> +
> +	/* resume IRQ status */
> +	amdgpu_irq_gpu_reset_resume_helper(adev);
> +
> +	r = amdgpu_ib_ring_tests(adev);
> +	if (r)
> +		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
> +
> +	/* release full control of GPU after ib test */
> +	amdgpu_virt_release_full_gpu(adev, true);
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		kthread_unpark(ring->sched.thread);
> +	}
> +
> +	drm_helper_resume_force_mode(adev->ddev);
> +
> +	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
> +	if (r)
> +		dev_info(adev->dev, "Strict mode GPU reset failed\n");
> +	else
> +		dev_info(adev->dev, "Strict mode GPU reset successed!\n");
> +
> +	atomic_set(&adev->in_sriov_reset, 0);
> +	return 0;
> +}
> +
> +/**
>    * amdgpu_sriov_gpu_reset - reset the asic
>    *
>    * @adev: amdgpu device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 8f5211c..eee67dc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -123,6 +123,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;
>   int amdgpu_param_buf_per_se = 0;
>   int amdgpu_job_hang_limit = 0;
>   int amdgpu_lbpw = -1;
> +int amdgpu_sriov_reset_level = 0;
>   
>   MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
>   module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
> @@ -269,6 +270,9 @@ module_param_named(job_hang_limit, amdgpu_job_hang_limit, int ,0444);
>   MODULE_PARM_DESC(lbpw, "Load Balancing Per Watt (LBPW) support (1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(lbpw, amdgpu_lbpw, int, 0444);
>   
> +MODULE_PARM_DESC(sriov_reset_level, "what level the gpu reset runs on, 0: loose, 1: strict, other: disabled (default 0)");
> +module_param_named(sriov_reset_level, amdgpu_sriov_reset_level, int, 0444);
> +
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   
>   #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE) 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 0db81a4..933823a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -41,7 +41,11 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
>   		int r;
>   
>   try_again:
> -		r = amdgpu_sriov_gpu_reset(job->adev, job);
> +		if (amdgpu_sriov_reset_level == 1)
> +			r = amdgpu_sriov_gpu_reset_strict(job->adev, job);
> +		else
> +			r = amdgpu_sriov_gpu_reset(job->adev, job);
> +
>   		if (r == -EAGAIN) {
>   			/* maybe two different schedulers both have a hung job, try again later */
>   			schedule();
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index a3cbd5a..5664a10 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -100,5 +100,6 @@ int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>   int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
>   int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
>   void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
> +int amdgpu_sriov_gpu_reset_strict(struct amdgpu_device *adev, struct amdgpu_job *job);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 2812d88..00a9629 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -247,8 +247,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   		return;
>   	}
>   
> -	/* Trigger recovery due to world switch failure */
> -	amdgpu_sriov_gpu_reset(adev, NULL);
> +	/* use strict mode if FLR triggered from hypervisor */
> +	amdgpu_sriov_gpu_reset_strict(adev, NULL);
>   }
>   
>   static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index c25a831..c94b6e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -513,8 +513,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   		return;
>   	}
>   
> -	/* Trigger recovery due to world switch failure */
> -	amdgpu_sriov_gpu_reset(adev, NULL);
> +	/* use strict mode if FLR triggered from hypervisor */
> +	amdgpu_sriov_gpu_reset_strict(adev, NULL);
>   }
>   
>   static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, 
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 97c94f9..12c3092 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -430,6 +430,66 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> +/**
> + * amd_sched_set_sched_hang
> + * @sched: the scheduler whose pending jobs should be marked hung
> + *
> + * this routine sets all unfinished jobs pending in the scheduler to
> + * an error -ETIME status
> + *
> + **/
> +void amd_sched_set_sched_hang(struct amd_gpu_scheduler *sched)
> +{
> +	struct amd_sched_job *s_job;
> +
> +	spin_lock(&sched->job_list_lock);
> +	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node)
> +		dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
> +
> +	spin_unlock(&sched->job_list_lock);
> +}
> +
> +/**
> + * amd_sched_set_queue_hang
> + * @sched: the scheduler whose queued jobs should be marked hung
> + *
> + * this routine sets all jobs in the KFIFO of @sched to an error
> + * -ETIME status and fake-signals those jobs.
> + *
> + **/
> +void amd_sched_set_queue_hang(struct amd_gpu_scheduler *sched)
> +{
> +	struct amd_sched_entity *entity, *tmp;
> +	struct amd_sched_job *s_job;
> +	struct amd_sched_rq *rq;
> +	int i;
> +
> +	/* set HANG status on all jobs queued and fake signal them */
> +	for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) {
> +		rq = &sched->sched_rq[i];
> +
> +		spin_lock(&rq->lock);
> +		list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> +			if (entity->dependency) {
> +				dma_fence_remove_callback(entity->dependency, &entity->cb);
> +				dma_fence_put(entity->dependency);
> +				entity->dependency = NULL;
> +			}
> +
> +			spin_lock(&entity->queue_lock);
> +			while (kfifo_out(&entity->job_queue, &s_job, sizeof(s_job)) == sizeof(s_job)) {
> +				dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
> +				amd_sched_fence_scheduled(s_job->s_fence);
> +				amd_sched_fence_finished(s_job->s_fence);
> +			}
> +			spin_unlock(&entity->queue_lock);
> +		}
> +		spin_unlock(&rq->lock);
> +	}
> +	wake_up(&sched->job_scheduled);
> +}
> +
>   void amd_sched_job_kickout(struct amd_sched_job *s_job)
>   {
>   	struct amd_gpu_scheduler *sched = s_job->sched;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index f9d8f28..f0242aa 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -167,4 +167,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
>   bool amd_sched_dependency_optimized(struct dma_fence* fence,
>   				    struct amd_sched_entity *entity);
>   void amd_sched_job_kickout(struct amd_sched_job *s_job);
> +void amd_sched_set_queue_hang(struct amd_gpu_scheduler *sched);
> +void amd_sched_set_sched_hang(struct amd_gpu_scheduler *sched);
>   #endif



