[PATCH] drm/amdgpu: introduce vram lost paramter for reset function

Monk Liu Monk.Liu at amd.com
Fri Aug 23 03:34:09 UTC 2019


for SOC15/vega10 the BACO reset would introduce vram lost in
the high end address range and current kmd's vram lost
checking cannot catch it since it only check visible frame buffer

TODO:
to confirm if mode1/2 reset would introduce vram lost

Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
 drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
 drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
 drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
 drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
 drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
 7 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f6ae565..1fe3756 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {
 	int (*read_register)(struct amdgpu_device *adev, u32 se_num,
 			     u32 sh_num, u32 reg_offset, u32 *value);
 	void (*set_vga_state)(struct amdgpu_device *adev, bool state);
-	int (*reset)(struct amdgpu_device *adev);
+	int (*reset)(struct amdgpu_device *adev, bool *lost);
 	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);
 	/* get the reference clock */
 	u32 (*get_xclk)(struct amdgpu_device *adev);
@@ -1136,7 +1136,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
  * ASICs macro.
  */
 #define amdgpu_asic_set_vga_state(adev, state) (adev)->asic_funcs->set_vga_state((adev), (state))
-#define amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
+#define amdgpu_asic_reset(adev, lost) (adev)->asic_funcs->reset((adev), (lost))
 #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))
 #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
 #define amdgpu_asic_set_uvd_clocks(adev, v, d) (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 02b3e7d..8668cb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -2751,7 +2751,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
+		r = amdgpu_asic_reset(adev, NULL);
 		if (r) {
 			dev_err(adev->dev, "asic reset on init failed\n");
 			goto failed;
@@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
 		pci_disable_device(dev->pdev);
 		pci_set_power_state(dev->pdev, PCI_D3hot);
 	} else {
-		r = amdgpu_asic_reset(adev);
+		r = amdgpu_asic_reset(adev, NULL);
 		if (r)
 			DRM_ERROR("amdgpu asic reset failed\n");
 	}
@@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
-				r = amdgpu_asic_reset(tmp_adev);
+				r = amdgpu_asic_reset(tmp_adev, &vram_lost);
 
 			if (r) {
 				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
@@ -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				if (r)
 					goto out;
 
-				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+				if (!vram_lost)
+					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
 					atomic_inc(&tmp_adev->vram_lost_counter);
diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c
index 7b63d7a..0f25b82 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik.c
@@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)
  * to reset them.
  * Returns 0 for success.
  */
-static int cik_asic_reset(struct amdgpu_device *adev)
+static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	int r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index a3d99f2..53de7a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
 		return AMD_RESET_METHOD_MODE1;
 }
 
-static int nv_asic_reset(struct amdgpu_device *adev)
+static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 
 	/* FIXME: it doesn't work since vega10 */
@@ -315,10 +315,14 @@ static int nv_asic_reset(struct amdgpu_device *adev)
 	int ret = 0;
 	struct smu_context *smu = &adev->smu;
 
-	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+		if (vramlost)
+			*vramlost = true;
 		ret = smu_baco_reset(smu);
-	else
+	}
+	else {
 		ret = nv_asic_mode1_reset(adev);
+	}
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c
index 9043614..f324099 100644
--- a/drivers/gpu/drm/amd/amdgpu/si.c
+++ b/drivers/gpu/drm/amd/amdgpu/si.c
@@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,
 }
 
 //xxx: not implemented
-static int si_asic_reset(struct amdgpu_device *adev)
+static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index fe2212df..12b2966 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
 		return AMD_RESET_METHOD_MODE1;
 }
 
-static int soc15_asic_reset(struct amdgpu_device *adev)
+static int soc15_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	switch (soc15_asic_reset_method(adev)) {
 		case AMD_RESET_METHOD_BACO:
+			if (vramlost)
+				*vramlost = true;
 			return soc15_asic_baco_reset(adev);
 		case AMD_RESET_METHOD_MODE2:
 			return soc15_mode2_reset(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 56c882b..8eceb00 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)
  * to reset them.
  * Returns 0 for success.
  */
-static int vi_asic_reset(struct amdgpu_device *adev)
+static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	int r;
 
-- 
2.7.4



More information about the amd-gfx mailing list