[PATCH v8] drm/amd/amdgpu:Fix compute ring unable to detect hang.

Christian König ckoenig.leichtzumerken at gmail.com
Fri Sep 20 14:29:10 UTC 2019


Am 20.09.19 um 08:57 schrieb Jesse Zhang:
> When a compute fence does not signal, the compute ring cannot detect a
> hardware hang because its timeout value is infinite by default.
>
> In SR-IOV and passthrough mode, if the user does not declare a custom
> timeout value for the compute ring, use the gfx ring timeout value as the
> default, so that when there is a true hardware hang the compute ring can detect it.
>
> Change-Id: I794ec0868c6c0aad407749457260ecfee0617c10
> Signed-off-by: Jesse Zhang <zhexi.zhang at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  7 ++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +------------
>   3 files changed, 13 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3b5282b..03ac5a1da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1024,12 +1024,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
>   
>   	amdgpu_device_check_block_size(adev);
>   
> -	ret = amdgpu_device_get_job_timeout_settings(adev);
> -	if (ret) {
> -		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
> -		return ret;
> -	}
> -
>   	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
>   
>   	return ret;
> @@ -2732,6 +2726,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	if (r)
>   		return r;
>   
> +	r = amdgpu_device_get_job_timeout_settings(adev);
> +	if (r) {
> +		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
> +		return r;
> +	}
> +
>   	/* doorbell bar mapping and doorbell index init*/
>   	amdgpu_device_doorbell_init(adev);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 420888e..98be49b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -1338,10 +1338,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>   	/*
>   	 * By default timeout for non compute jobs is 10000.
>   	 * And there is no timeout enforced on compute jobs.
> +	 * In SR-IOV or passthrough mode, timeout for compute
> +	 * jobs are 10000 by default.
>   	 */
>   	adev->gfx_timeout = msecs_to_jiffies(10000);
>   	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
> -	adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
> +	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
> +		adev->compute_timeout = adev->gfx_timeout;
> +	else
> +		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
>   
>   	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) {
>   		while ((timeout_setting = strsep(&input, ",")) &&
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index cbcaa7c..9ef53ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -460,18 +460,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   			timeout = adev->gfx_timeout;
>   			break;
>   		case AMDGPU_RING_TYPE_COMPUTE:
> -			/*
> -			 * For non-sriov case, no timeout enforce
> -			 * on compute ring by default. Unless user
> -			 * specifies a timeout for compute ring.
> -			 *
> -			 * For sriov case, always use the timeout
> -			 * as gfx ring
> -			 */
> -			if (!amdgpu_sriov_vf(ring->adev))
> -				timeout = adev->compute_timeout;
> -			else
> -				timeout = adev->gfx_timeout;
> +			timeout = adev->compute_timeout;
>   			break;
>   		case AMDGPU_RING_TYPE_SDMA:
>   			timeout = adev->sdma_timeout;



More information about the amd-gfx mailing list