[PATCH v8] drm/amd/amdgpu:Fix compute ring unable to detect hang.

Jesse Zhang zhexi.zhang at amd.com
Fri Sep 20 06:57:17 UTC 2019


When compute fence did not signal, compute ring cannot detect hardware hang
because its timeout value is set to be infinite by default.

In SR-IOV and passthrough mode, if user does not declare custome timeout
value for compute ring, then use gfx ring timeout value as default. So
that when there is a ture hardware hang, compute ring can detect it.

Change-Id: I794ec0868c6c0aad407749457260ecfee0617c10
Signed-off-by: Jesse Zhang <zhexi.zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  7 ++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +------------
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3b5282b..03ac5a1da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1024,12 +1024,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 
 	amdgpu_device_check_block_size(adev);
 
-	ret = amdgpu_device_get_job_timeout_settings(adev);
-	if (ret) {
-		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
-		return ret;
-	}
-
 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
 
 	return ret;
@@ -2732,6 +2726,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (r)
 		return r;
 
+	r = amdgpu_device_get_job_timeout_settings(adev);
+	if (r) {
+		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
+		return r;
+	}
+
 	/* doorbell bar mapping and doorbell index init*/
 	amdgpu_device_doorbell_init(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 420888e..98be49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1338,10 +1338,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	/*
 	 * By default timeout for non compute jobs is 10000.
 	 * And there is no timeout enforced on compute jobs.
+	 * In SR-IOV or passthrough mode, timeout for compute
+	 * jobs are 10000 by default.
 	 */
 	adev->gfx_timeout = msecs_to_jiffies(10000);
 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
-	adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
+	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
+		adev->compute_timeout = adev->gfx_timeout;
+	else
+		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
 
 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) {
 		while ((timeout_setting = strsep(&input, ",")) &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index cbcaa7c..9ef53ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -460,18 +460,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 			timeout = adev->gfx_timeout;
 			break;
 		case AMDGPU_RING_TYPE_COMPUTE:
-			/*
-			 * For non-sriov case, no timeout enforce
-			 * on compute ring by default. Unless user
-			 * specifies a timeout for compute ring.
-			 *
-			 * For sriov case, always use the timeout
-			 * as gfx ring
-			 */
-			if (!amdgpu_sriov_vf(ring->adev))
-				timeout = adev->compute_timeout;
-			else
-				timeout = adev->gfx_timeout;
+			timeout = adev->compute_timeout;
 			break;
 		case AMDGPU_RING_TYPE_SDMA:
 			timeout = adev->sdma_timeout;
-- 
2.7.4



More information about the amd-gfx mailing list