[PATCH 2/4] drm/amdgpu: refine the GPU recovery sequence
Christian König
christian.koenig at amd.com
Thu Mar 18 07:56:55 UTC 2021
On 18.03.21 at 08:23, Dennis Li wrote:
> Changed to only set in_gpu_reset to 1 when the recovery thread begins,
> and delayed taking reset_sem until after pre-reset but before the actual
> reset. This makes sure that other threads have exited or been blocked
> before the GPU reset is done. Compared with the old code, it lets some
> threads exit earlier without waiting for a timeout.
>
> Introduce an event, recovery_fini_event, which is used to block new
> threads once the recovery thread has begun. These threads are only woken
> up when the recovery thread exits.
>
> v2: remove the code that checks the usage of adev->reset_sem, because
> lockdep will show all locks held in the system when it detects a hung
> timeout in the recovery thread.
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 02a34f9a26aa..67c716e5ee8d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1044,6 +1044,8 @@ struct amdgpu_device {
> atomic_t in_gpu_reset;
> enum pp_mp1_state mp1_state;
> struct rw_semaphore reset_sem;
> + wait_queue_head_t recovery_fini_event;
> +
> struct amdgpu_doorbell_index doorbell_index;
>
> struct mutex notifier_lock;
> @@ -1406,4 +1408,8 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
> {
> return atomic_read(&adev->in_gpu_reset);
> }
> +
> +int amdgpu_read_lock(struct drm_device *dev, bool interruptible);
> +void amdgpu_read_unlock(struct drm_device *dev);
> +
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 24ff5992cb02..15235610cc54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -211,6 +211,60 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev,
> static DEVICE_ATTR(serial_number, S_IRUGO,
> amdgpu_device_get_serial_number, NULL);
>
> +int amdgpu_read_lock(struct drm_device *dev, bool interruptible)
> +{
> + struct amdgpu_device *adev = drm_to_adev(dev);
> + int ret = 0;
> +
> + /*
> + * If a thread holds the read lock but the recovery thread has already
> + * started, it should release the read lock and wait for the recovery
> + * thread to finish, because the pre-reset functions have begun: they
> + * stop the old threads but do not cover the current thread.
> + */
> + if (interruptible) {
> + while (!(ret = down_read_killable(&adev->reset_sem)) &&
> + amdgpu_in_reset(adev)) {
> + up_read(&adev->reset_sem);
> + ret = wait_event_interruptible(adev->recovery_fini_event,
> + !amdgpu_in_reset(adev));
> + if (ret)
> + break;
> + }
> + } else {
> + down_read(&adev->reset_sem);
> + while (amdgpu_in_reset(adev)) {
> + up_read(&adev->reset_sem);
> + wait_event(adev->recovery_fini_event,
> + !amdgpu_in_reset(adev));
> + down_read(&adev->reset_sem);
> + }
> + }
Ok once more. This general approach is a NAK. We have already tried this
and it doesn't work.
All you do here is replace the GPU reset lock with
wait_event_interruptible().
From an upstream perspective that is strictly illegal, since it just
prevents lockdep warnings from filling the logs and doesn't really solve
any problem.
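To make the point concrete, a rough sketch (illustrative only, not code
from the patch):

    /* A reader blocked on the annotated rwsem is a dependency
     * lockdep can track and warn about: */
    down_read(&adev->reset_sem);
    /* ... touch the hardware ... */
    up_read(&adev->reset_sem);

    /* The open-coded event wait expresses the same blocking
     * relationship, but lockdep cannot see it at all: */
    wait_event(adev->recovery_fini_event, !amdgpu_in_reset(adev));
    /* ... touch the hardware, with no tracked dependency ... */

Any deadlock between the reader and the reset path still exists, it just
no longer shows up in the logs.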
Regards,
Christian.
> +
> + return ret;
> +}
> +
> +void amdgpu_read_unlock(struct drm_device *dev)
> +{
> + struct amdgpu_device *adev = drm_to_adev(dev);
> +
> + up_read(&adev->reset_sem);
> +}
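For context, the intended usage is presumably that driver entry points
bracket their hardware access with this pair, roughly like so (a
hypothetical caller, not part of this diff):

    static int amdgpu_some_ioctl(struct drm_device *dev, void *data,
                                 struct drm_file *filp)
    {
        int r;

        /* Blocks (interruptibly) until no GPU recovery is in flight. */
        r = amdgpu_read_lock(dev, true);
        if (r)
            return r;

        /* ... hardware access, with recovery excluded ... */

        amdgpu_read_unlock(dev);
        return 0;
    }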
> +
> +static void amdgpu_write_lock(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> + if (hive) {
> + down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
> + } else {
> + down_write(&adev->reset_sem);
> + }
> +}
> +
> +static void amdgpu_write_unlock(struct amdgpu_device *adev)
> +{
> + up_write(&adev->reset_sem);
> +}
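As an aside, the nest-lock annotation is what keeps lockdep quiet when a
hive takes several reset_sems of the same lock class in a row; a minimal
sketch of why (assuming hive->hive_lock is held around the iteration):

    /* All reset_sems share one lock class; without the nest-lock
     * annotation, the second down_write() would look like a
     * recursive deadlock to lockdep. */
    mutex_lock(&hive->hive_lock);
    list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
        down_write_nest_lock(&tmp_adev->reset_sem, &hive->hive_lock);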
> +
> /**
> * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
> *
> @@ -3280,6 +3334,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> hash_init(adev->mn_hash);
> atomic_set(&adev->in_gpu_reset, 0);
> init_rwsem(&adev->reset_sem);
> + init_waitqueue_head(&adev->recovery_fini_event);
> mutex_init(&adev->psp.mutex);
> mutex_init(&adev->notifier_lock);
>
> @@ -4509,39 +4564,18 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> return r;
> }
>
> -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
> - struct amdgpu_hive_info *hive)
> +static bool amdgpu_device_recovery_enter(struct amdgpu_device *adev)
> {
> if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
> return false;
>
> - if (hive) {
> - down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
> - } else {
> - down_write(&adev->reset_sem);
> - }
> -
> - switch (amdgpu_asic_reset_method(adev)) {
> - case AMD_RESET_METHOD_MODE1:
> - adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
> - break;
> - case AMD_RESET_METHOD_MODE2:
> - adev->mp1_state = PP_MP1_STATE_RESET;
> - break;
> - default:
> - adev->mp1_state = PP_MP1_STATE_NONE;
> - break;
> - }
> -
> return true;
> }
>
> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +static void amdgpu_device_recovery_exit(struct amdgpu_device *adev)
> {
> - amdgpu_vf_error_trans_all(adev);
> - adev->mp1_state = PP_MP1_STATE_NONE;
> atomic_set(&adev->in_gpu_reset, 0);
> - up_write(&adev->reset_sem);
> + wake_up_interruptible_all(&adev->recovery_fini_event);
> }
>
> /*
> @@ -4550,7 +4584,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> *
> * unlock won't require roll back.
> */
> -static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +static int amdgpu_hive_recovery_enter(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> {
> struct amdgpu_device *tmp_adev = NULL;
>
> @@ -4560,10 +4594,10 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
> return -ENODEV;
> }
> list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> - if (!amdgpu_device_lock_adev(tmp_adev, hive))
> + if (!amdgpu_device_recovery_enter(tmp_adev))
> goto roll_back;
> }
> - } else if (!amdgpu_device_lock_adev(adev, hive))
> + } else if (!amdgpu_device_recovery_enter(adev))
> return -EAGAIN;
>
> return 0;
> @@ -4578,12 +4612,61 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
> */
> dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> - amdgpu_device_unlock_adev(tmp_adev);
> + amdgpu_device_recovery_exit(tmp_adev);
> }
> }
> return -EAGAIN;
> }
>
> +static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
> + struct amdgpu_hive_info *hive)
> +{
> + amdgpu_write_lock(adev, hive);
> +
> + switch (amdgpu_asic_reset_method(adev)) {
> + case AMD_RESET_METHOD_MODE1:
> + adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
> + break;
> + case AMD_RESET_METHOD_MODE2:
> + adev->mp1_state = PP_MP1_STATE_RESET;
> + break;
> + default:
> + adev->mp1_state = PP_MP1_STATE_NONE;
> + break;
> + }
> +}
> +
> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +{
> + amdgpu_vf_error_trans_all(adev);
> + adev->mp1_state = PP_MP1_STATE_NONE;
> + amdgpu_write_unlock(adev);
> +}
> +
> +/*
> + * Lock a list of amdgpu devices in a hive safely; if this is not a hive
> + * with multiple nodes, it behaves the same as amdgpu_device_lock_adev.
> + *
> + * Unlock won't require rollback.
> + */
> +static void amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> + struct amdgpu_device *tmp_adev = NULL;
> +
> + if (adev->gmc.xgmi.num_physical_nodes > 1) {
> + if (!hive) {
> + dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> + return;
> + }
> +
> + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> + amdgpu_device_lock_adev(tmp_adev, hive);
> + }
> + } else {
> + amdgpu_device_lock_adev(adev, hive);
> + }
> +}
> +
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> {
> struct pci_dev *p = NULL;
> @@ -4732,6 +4815,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> bool need_emergency_restart = false;
> bool audio_suspended = false;
> int tmp_vram_lost_counter;
> + bool locked = false;
>
> /*
> * Special case: RAS triggered and full reset isn't supported
> @@ -4777,7 +4861,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> * If we didn't get the device lock, don't touch the linked list since
> * others may be iterating over it.
> */
> - r = amdgpu_device_lock_hive_adev(adev, hive);
> + r = amdgpu_hive_recovery_enter(adev, hive);
> if (r) {
> dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> job ? job->base.id : -1);
> @@ -4884,6 +4968,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> }
>
> tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
> + /*
> + * The pre-reset functions are called before taking the lock, which
> + * makes sure that other threads holding the reset lock exit
> + * successfully. No other thread runs in the driver while the
> + * recovery thread runs.
> + */
> + if (!locked) {
> + amdgpu_device_lock_hive_adev(adev, hive);
> + locked = true;
> + }
> +
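For readers following along, the resulting order of operations in the
recovery thread is roughly (my summary of this patch, simplified):

    /*
     *   atomic_cmpxchg(&adev->in_gpu_reset, 0, 1)    recovery_enter
     *   pre-reset work / job drain                   readers start waiting
     *   down_write(&adev->reset_sem)                 delayed write lock
     *   actual ASIC reset and post-reset work
     *   atomic_set(&adev->in_gpu_reset, 0)           recovery_exit
     *   wake_up_interruptible_all(&adev->recovery_fini_event)
     *   up_write(&adev->reset_sem)                   unlock_adev
     */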
> /* Actual ASIC resets if needed.*/
> /* TODO Implement XGMI hive reset logic for SRIOV */
> if (amdgpu_sriov_vf(adev)) {
> @@ -4955,7 +5049,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> if (audio_suspended)
> amdgpu_device_resume_display_audio(tmp_adev);
> - amdgpu_device_unlock_adev(tmp_adev);
> + amdgpu_device_recovery_exit(tmp_adev);
> + if (locked)
> + amdgpu_device_unlock_adev(tmp_adev);
> }
>
> skip_recovery:
> @@ -5199,9 +5295,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
> * Locking adev->reset_sem will prevent any external access
> * to GPU during PCI error recovery
> */
> - while (!amdgpu_device_lock_adev(adev, NULL))
> + while (!amdgpu_device_recovery_enter(adev))
> amdgpu_cancel_all_tdr(adev);
>
> + amdgpu_device_lock_adev(adev, NULL);
> /*
> * Block any work scheduling as we do for regular GPU reset
> * for the duration of the recovery