[PATCH 2/4] drm/amdgpu: refine the GPU recovery sequence
Christian König
christian.koenig at amd.com
Thu Mar 18 07:56:55 UTC 2021
On 18.03.21 at 08:23, Dennis Li wrote:
> Changed to only set in_gpu_reset to 1 when the recovery thread begins,
> and delayed taking reset_sem until after pre-reset but before the actual
> reset. This makes sure that other threads have exited or been blocked
> before the GPU reset is done. Compared with the old code, it lets some
> threads exit earlier without waiting for a timeout.
>
> Introduce an event, recovery_fini_event, which is used to block new
> threads once the recovery thread has begun. These threads are only woken
> up when the recovery thread exits.
>
> v2: remove the code that checks the usage of adev->reset_sem, because
> lockdep will show all locks held in the system when it detects a hung
> timeout in the recovery thread.
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 02a34f9a26aa..67c716e5ee8d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1044,6 +1044,8 @@ struct amdgpu_device {
> atomic_t in_gpu_reset;
> enum pp_mp1_state mp1_state;
> struct rw_semaphore reset_sem;
> + wait_queue_head_t recovery_fini_event;
> +
> struct amdgpu_doorbell_index doorbell_index;
>
> struct mutex notifier_lock;
> @@ -1406,4 +1408,8 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
> {
> return atomic_read(&adev->in_gpu_reset);
> }
> +
> +int amdgpu_read_lock(struct drm_device *dev, bool interruptible);
> +void amdgpu_read_unlock(struct drm_device *dev);
> +
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 24ff5992cb02..15235610cc54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -211,6 +211,60 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev,
> static DEVICE_ATTR(serial_number, S_IRUGO,
> amdgpu_device_get_serial_number, NULL);
>
> +int amdgpu_read_lock(struct drm_device *dev, bool interruptible)
> +{
> + struct amdgpu_device *adev = drm_to_adev(dev);
> + int ret = 0;
> +
> + /*
> + * If a thread holds the read lock but the recovery thread has already
> + * started, it should release the read lock and wait for the recovery
> + * thread to finish, because the pre-reset functions have begun: they
> + * stop the old threads but do not cover the current thread.
> + */
> + if (interruptible) {
> + while (!(ret = down_read_killable(&adev->reset_sem)) &&
> + amdgpu_in_reset(adev)) {
> + up_read(&adev->reset_sem);
> + ret = wait_event_interruptible(adev->recovery_fini_event,
> + !amdgpu_in_reset(adev));
> + if (ret)
> + break;
> + }
> + } else {
> + down_read(&adev->reset_sem);
> + while (amdgpu_in_reset(adev)) {
> + up_read(&adev->reset_sem);
> + wait_event(adev->recovery_fini_event,
> + !amdgpu_in_reset(adev));
> + down_read(&adev->reset_sem);
> + }
> + }
Ok once more. This general approach is a NAK. We have already tried this
and it doesn't work.
All you do here is replace the GPU reset lock with
wait_event_interruptible().
From an upstream perspective that is strictly illegal, since it just
prevents lockdep warnings from filling the logs and doesn't really solve
any problem.
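To make the point concrete, a rough sketch (illustrative only, not code
from the patch):

    /* A reader blocked on the annotated rwsem is a dependency
     * lockdep can track and warn about: */
    down_read(&adev->reset_sem);
    /* ... touch the hardware ... */
    up_read(&adev->reset_sem);

    /* The open-coded event wait expresses the same blocking
     * relationship, but lockdep cannot see it at all: */
    wait_event(adev->recovery_fini_event, !amdgpu_in_reset(adev));
    /* ... touch the hardware, with no tracked dependency ... */

Any deadlock between the reader and the reset path still exists, it just
no longer shows up in the logs.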
Regards,
Christian.
> +
> + return ret;
> +}
> +
> +void amdgpu_read_unlock(struct drm_device *dev)
> +{
> + struct amdgpu_device *adev = drm_to_adev(dev);
> +
> + up_read(&adev->reset_sem);
> +}
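For context, the intended usage is presumably that driver entry points
bracket their hardware access with this pair, roughly like so (a
hypothetical caller, not part of this diff):

    static int amdgpu_some_ioctl(struct drm_device *dev, void *data,
                                 struct drm_file *filp)
    {
        int r;

        /* Blocks (interruptibly) until no GPU recovery is in flight. */
        r = amdgpu_read_lock(dev, true);
        if (r)
            return r;

        /* ... hardware access, with recovery excluded ... */

        amdgpu_read_unlock(dev);
        return 0;
    }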
> +
> +static void amdgpu_write_lock(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> + if (hive) {
> + down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
> + } else {
> + down_write(&adev->reset_sem);
> + }
> +}
> +
> +static void amdgpu_write_unlock(struct amdgpu_device *adev)
> +{
> + up_write(&adev->reset_sem);
> +}
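As an aside, the nest-lock annotation is what keeps lockdep quiet when a
hive takes several reset_sems of the same lock class in a row; a minimal
sketch of why (assuming hive->hive_lock is held around the iteration):

    /* All reset_sems share one lock class; without the nest-lock
     * annotation, the second down_write() would look like a
     * recursive deadlock to lockdep. */
    mutex_lock(&hive->hive_lock);
    list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
        down_write_nest_lock(&tmp_adev->reset_sem, &hive->hive_lock);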
> +
> /**
> * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
> *
> @@ -3280,6 +3334,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> hash_init(adev->mn_hash);
> atomic_set(&adev->in_gpu_reset, 0);
> init_rwsem(&adev->reset_sem);
> + init_waitqueue_head(&adev->recovery_fini_event);
> mutex_init(&adev->psp.mutex);
> mutex_init(&adev->notifier_lock);
>
> @@ -4509,39 +4564,18 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> return r;
> }
>
> -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
> - struct amdgpu_hive_info *hive)
> +static bool amdgpu_device_recovery_enter(struct amdgpu_device *adev)
> {
> if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
> return false;
>
> - if (hive) {
> - down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
> - } else {
> - down_write(&adev->reset_sem);
> - }
> -
> - switch (amdgpu_asic_reset_method(adev)) {
> - case AMD_RESET_METHOD_MODE1:
> - adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
> - break;
> - case AMD_RESET_METHOD_MODE2:
> - adev->mp1_state = PP_MP1_STATE_RESET;
> - break;
> - default:
> - adev->mp1_state = PP_MP1_STATE_NONE;
> - break;
> - }
> -
> return true;
> }
>
> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +static void amdgpu_device_recovery_exit(struct amdgpu_device *adev)
> {
> - amdgpu_vf_error_trans_all(adev);
> - adev->mp1_state = PP_MP1_STATE_NONE;
> atomic_set(&adev->in_gpu_reset, 0);
> - up_write(&adev->reset_sem);
> + wake_up_interruptible_all(&adev->recovery_fini_event);
> }
>
> /*
> @@ -4550,7 +4584,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> *
> * unlock won't require roll back.
> */
> -static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +static int amdgpu_hive_recovery_enter(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> {
> struct amdgpu_device *tmp_adev = NULL;
>
> @@ -4560,10 +4594,10 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
> return -ENODEV;
> }
> list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> - if (!amdgpu_device_lock_adev(tmp_adev, hive))
> + if (!amdgpu_device_recovery_enter(tmp_adev))
> goto roll_back;
> }
> - } else if (!amdgpu_device_lock_adev(adev, hive))
> + } else if (!amdgpu_device_recovery_enter(adev))
> return -EAGAIN;
>
> return 0;
> @@ -4578,12 +4612,61 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
> */
> dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> - amdgpu_device_unlock_adev(tmp_adev);
> + amdgpu_device_recovery_exit(tmp_adev);
> }
> }
> return -EAGAIN;
> }
>
> +static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
> + struct amdgpu_hive_info *hive)
> +{
> + amdgpu_write_lock(adev, hive);
> +
> + switch (amdgpu_asic_reset_method(adev)) {
> + case AMD_RESET_METHOD_MODE1:
> + adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
> + break;
> + case AMD_RESET_METHOD_MODE2:
> + adev->mp1_state = PP_MP1_STATE_RESET;
> + break;
> + default:
> + adev->mp1_state = PP_MP1_STATE_NONE;
> + break;
> + }
> +}
> +
> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +{
> + amdgpu_vf_error_trans_all(adev);
> + adev->mp1_state = PP_MP1_STATE_NONE;
> + amdgpu_write_unlock(adev);
> +}
> +
> +/*
> + * Lock a list of amdgpu devices in a hive safely; if this is not a hive
> + * with multiple nodes, it behaves the same as amdgpu_device_lock_adev.
> + *
> + * Unlock won't require rollback.
> + */
> +static void amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> + struct amdgpu_device *tmp_adev = NULL;
> +
> + if (adev->gmc.xgmi.num_physical_nodes > 1) {
> + if (!hive) {
> + dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> + return;
> + }
> +
> + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> + amdgpu_device_lock_adev(tmp_adev, hive);
> + }
> + } else {
> + amdgpu_device_lock_adev(adev, hive);
> + }
> +}
> +
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> {
> struct pci_dev *p = NULL;
> @@ -4732,6 +4815,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> bool need_emergency_restart = false;
> bool audio_suspended = false;
> int tmp_vram_lost_counter;
> + bool locked = false;
>
> /*
> * Special case: RAS triggered and full reset isn't supported
> @@ -4777,7 +4861,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> * If we didn't get the device lock, don't touch the linked list since
> * others may be iterating over it.
> */
> - r = amdgpu_device_lock_hive_adev(adev, hive);
> + r = amdgpu_hive_recovery_enter(adev, hive);
> if (r) {
> dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> job ? job->base.id : -1);
> @@ -4884,6 +4968,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> }
>
> tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
> + /*
> + * The pre-reset functions are called before taking the lock, which
> + * makes sure that other threads holding the reset lock exit
> + * successfully. No other thread runs in the driver while the
> + * recovery thread runs.
> + */
> + if (!locked) {
> + amdgpu_device_lock_hive_adev(adev, hive);
> + locked = true;
> + }
> +
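For readers following along, the resulting order of operations in the
recovery thread is roughly (my summary of this patch, simplified):

    /*
     *   atomic_cmpxchg(&adev->in_gpu_reset, 0, 1)    recovery_enter
     *   pre-reset work / job drain                   readers start waiting
     *   down_write(&adev->reset_sem)                 delayed write lock
     *   actual ASIC reset and post-reset work
     *   atomic_set(&adev->in_gpu_reset, 0)           recovery_exit
     *   wake_up_interruptible_all(&adev->recovery_fini_event)
     *   up_write(&adev->reset_sem)                   unlock_adev
     */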
> /* Actual ASIC resets if needed.*/
> /* TODO Implement XGMI hive reset logic for SRIOV */
> if (amdgpu_sriov_vf(adev)) {
> @@ -4955,7 +5049,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> if (audio_suspended)
> amdgpu_device_resume_display_audio(tmp_adev);
> - amdgpu_device_unlock_adev(tmp_adev);
> + amdgpu_device_recovery_exit(tmp_adev);
> + if (locked)
> + amdgpu_device_unlock_adev(tmp_adev);
> }
>
> skip_recovery:
> @@ -5199,9 +5295,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
> * Locking adev->reset_sem will prevent any external access
> * to GPU during PCI error recovery
> */
> - while (!amdgpu_device_lock_adev(adev, NULL))
> + while (!amdgpu_device_recovery_enter(adev))
> amdgpu_cancel_all_tdr(adev);
>
> + amdgpu_device_lock_adev(adev, NULL);
> /*
> * Block any work scheduling as we do for regular GPU reset
> * for the duration of the recovery