[PATCH v2] drm/amdgpu: Add gpu_recovery parameter

Christian König christian.koenig at amd.com
Fri Dec 15 13:19:22 UTC 2017


Am 13.12.2017 um 20:01 schrieb Andrey Grodzovsky:
> Add new parameter to control GPU recovery procedure.
> Retire old way of disabling GPU recovery by setting lockup_timeout == 0 and
> set default for lockup_timeout to 10s.
>
> v2:
> Add auto logic where reset is disabled for bare metal and enabled
> for SR-IOV.
> Allow forced reset from debugfs.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 ++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      | 2 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      | 2 +-
>   8 files changed, 19 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3735500..d7f0263 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -126,6 +126,7 @@ extern int amdgpu_param_buf_per_se;
>   extern int amdgpu_job_hang_limit;
>   extern int amdgpu_lbpw;
>   extern int amdgpu_compute_multipipe;
> +extern int amdgpu_gpu_recovery;
>   
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   extern int amdgpu_si_support;
> @@ -1879,7 +1880,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> -int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
> +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job, bool force);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_post(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8d03baa..a074502 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3015,11 +3015,12 @@ static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags,
>    *
>    * @adev: amdgpu device pointer
>    * @job: which job trigger hang
> + * @force: forces reset regardless of amdgpu_gpu_recovery
>    *
>    * Attempt to reset the GPU if it has hung (all asics).
>    * Returns 0 for success or an error on failure.
>    */
> -int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
> +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job, bool force)
>   {
>   	struct drm_atomic_state *state = NULL;
>   	uint64_t reset_flags = 0;
> @@ -3030,6 +3031,12 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
>   		return 0;
>   	}
>   
> +	if (!force && (amdgpu_gpu_recovery == 0 ||
> +			(amdgpu_gpu_recovery == -1  && !amdgpu_sriov_vf(adev)))) {
> +		DRM_INFO("GPU recovery disabled.\n");
> +		return 0;
> +	}
> +
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
>   	mutex_lock(&adev->lock_reset);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 0b039bd..b734cd6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -128,6 +128,7 @@ int amdgpu_param_buf_per_se = 0;
>   int amdgpu_job_hang_limit = 0;
>   int amdgpu_lbpw = -1;
>   int amdgpu_compute_multipipe = -1;
> +int amdgpu_gpu_recovery = -1; /* auto */
>   
>   MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
>   module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
> @@ -280,6 +281,9 @@ module_param_named(lbpw, amdgpu_lbpw, int, 0444);
>   MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>   
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto");
> +module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
> +
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   
>   #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 1469963..854baf0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -705,7 +705,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
>   	struct amdgpu_device *adev = dev->dev_private;
>   
>   	seq_printf(m, "gpu recover\n");
> -	amdgpu_gpu_recover(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL, true);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index c340774..c43643e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
>   						  reset_work);
>   
>   	if (!amdgpu_sriov_vf(adev))
> -		amdgpu_gpu_recover(adev, NULL);
> +		amdgpu_gpu_recover(adev, NULL, false);
>   }
>   
>   /* Disable *all* interrupts */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 013c0a8..be8a437 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,7 +37,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  atomic_read(&job->ring->fence_drv.last_seq),
>   		  job->ring->fence_drv.sync_seq);
>   
> -	amdgpu_gpu_recover(job->adev, job);
> +	amdgpu_gpu_recover(job->adev, job, false);
>   }
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 71f5690..7ade56d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -253,7 +253,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_gpu_recover(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL, false);
>   }
>   
>   static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index df52824..e05823d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_gpu_recover(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL, false);
>   }
>   
>   static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,



More information about the amd-gfx mailing list