[PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place

Mon Sep 2 02:58:38 UTC 2019


> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
> Sent: 2019年8月30日 22:03
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org;
> Chen, Guchun <Guchun.Chen at amd.com>; Li, Dennis <Dennis.Li at amd.com>;
> Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and
> bad page reserve to proper place
> 
> 
> On 8/30/19 8:24 AM, Tao Zhou wrote:
> > ras recovery_init should be called after ttm and smu init, bad page
> > reserve should be put in front of gpu reset since i2c may be unstable
> > during gpu reset add cleanup for recovery_init and recovery_fini
> >
> > Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++-------
> --
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
> >   3 files changed, 33 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 67b09cb2a9e2..4e4895ac926d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> >   		goto failed;
> >   	}
> >
> > +	/* retired pages will be loaded from eeprom and reserved here,
> > +	 * new bo may be created, it should be called after ttm init,
> > +	 * accessing eeprom also relies on smu (unlock i2c bus) and it
> > +	 * should be called after smu init
> > +	 *
> > +	 * Note: theoretically, this should be called before all vram allocations
> > +	 * to protect retired page from abusing, but there are some
> allocations
> > +	 * in front of smu fw loading
> > +	 */
> > +	amdgpu_ras_recovery_init(adev);
> 
> 
> You probably should check for return value here.
[Tao] No check here is intentional, the failure of recovery_init should not stop amdgpu init process and recovery_init could free resources allocated by itself if it fails. I'll add comment here and add a DRM_WARN for recovery_init.

> 
> 
> > +
> >   	/* must succeed. */
> >   	amdgpu_ras_resume(adev);
> >
> > @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct
> amdgpu_hive_info *hive,
> >   						break;
> >   				}
> >   			}
> > -
> > -			list_for_each_entry(tmp_adev, device_list_handle,
> > -					gmc.xgmi.head) {
> > -				amdgpu_ras_reserve_bad_pages(tmp_adev);
> > -			}
> >   		}
> >   	}
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 02120aa3cb5d..ad67a109122f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1477,14 +1477,13 @@ static int
> amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
> >   	return 0;
> >   }
> >
> > -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> >   {
> >   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> >   	struct ras_err_handler_data **data = &con->eh_data;
> >   	int ret;
> >
> > -	*data = kmalloc(sizeof(**data),
> > -			GFP_KERNEL|__GFP_ZERO);
> > +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
> >   	if (!*data)
> >   		return -ENOMEM;
> >
> > @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct
> > amdgpu_device *adev)
> >
> >   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> >   	if (ret)
> > -		return ret;
> > +		goto cancel_work;
> 
> 
> Why you need do go to cancel_work_sync(&con->recovery_work) at such
> early stage of device init, is RAS active already by this time and RAS interrupt
> might fire triggering reset  ?
> 
> Andrey
> 
[Tao] Good point, I'll remove cancel_work_sync.

> 
> >
> >   	if (adev->psp.ras.ras->eeprom_control.num_recs) {
> >   		ret = amdgpu_ras_load_bad_pages(adev);
> >   		if (ret)
> > -			return ret;
> > +			goto cancel_work;
> >   		ret = amdgpu_ras_reserve_bad_pages(adev);
> >   		if (ret)
> > -			return ret;
> > +			goto release;
> >   	}
> >
> >   	return 0;
> > +
> > +release:
> > +	amdgpu_ras_release_bad_pages(adev);
> > +cancel_work:
> > +	cancel_work_sync(&con->recovery_work);
> > +	con->eh_data = NULL;
> > +	kfree((*data)->bps);
> > +	kfree((*data)->bps_bo);
> > +	kfree(*data);
> > +
> > +	return ret;
> >   }
> >
> >   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@
> > -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
> >   	mutex_lock(&con->recovery_lock);
> >   	con->eh_data = NULL;
> >   	kfree(data->bps);
> > +	kfree(data->bps_bo);
> >   	kfree(data);
> >   	mutex_unlock(&con->recovery_lock);
> >
> > @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >   			return r;
> >   	}
> >
> > -	if (amdgpu_ras_recovery_init(adev))
> > -		goto recovery_out;
> > -
> >   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
> >
> >   	if (amdgpu_ras_fs_init(adev))
> > @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >   			con->hw_supported, con->supported);
> >   	return 0;
> >   fs_out:
> > -	amdgpu_ras_recovery_fini(adev);
> > -recovery_out:
> >   	amdgpu_ras_set_context(adev, NULL);
> >   	kfree(con);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 42e1d379e21c..6d00f79b612b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct
> amdgpu_device *adev,
> >   	return ras && (ras->supported & (1 << block));
> >   }
> >
> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> >   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
> >   		unsigned int block);
> >
> > @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct
> amdgpu_device *adev,
> >   {
> >   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> >
> > +	/* save bad page to eeprom before gpu reset,
> > +	 * i2c may be unstable in gpu reset
> > +	 */
> > +	amdgpu_ras_reserve_bad_pages(adev);
> >   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> >   		schedule_work(&ras->recovery_work);
> >   	return 0;