[PATCH v2] drm/amdgpu: Start using amdgpu_ras_add_bad_pages
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Tue Aug 13 17:59:11 UTC 2019
v2: Trigger GPU reset in case of new bad address errors.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
---
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d9..f5f36ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -243,17 +243,40 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
struct ras_err_data *err_data,
struct amdgpu_iv_entry *entry)
{
+ unsigned long new_err_addr_cnt, old_err_addr_cnt;
+ new_err_addr_cnt = 0;
+ old_err_addr_cnt = err_data->err_addr_cnt;
+
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->umc.funcs->query_ras_error_count)
adev->umc.funcs->query_ras_error_count(adev, err_data);
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
- if (adev->umc.funcs->query_ras_error_address)
+ if (adev->umc.funcs->query_ras_error_address) {
+ unsigned long *bps;
+ int i;
+
adev->umc.funcs->query_ras_error_address(adev, err_data);
+ new_err_addr_cnt = err_data->err_addr_cnt - old_err_addr_cnt;
+
+ if (new_err_addr_cnt) {
+ bps = kcalloc(new_err_addr_cnt, sizeof(*bps), GFP_KERNEL);
+ if (!bps)
+ return -ENOMEM;
+
+ for (i = 0; i < new_err_addr_cnt; i++)
+ bps[i] = err_data->err_addr[old_err_addr_cnt + i] >> PAGE_SHIFT;
+
+ amdgpu_ras_add_bad_pages(adev, bps, new_err_addr_cnt);
+
+ kfree(bps);
+ }
+ }
+
/* only uncorrectable error needs gpu reset */
- if (err_data->ue_count)
+ if (err_data->ue_count || new_err_addr_cnt)
amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_SUCCESS;
--
2.7.4
More information about the amd-gfx mailing list