[RFC v4 10/11] drm/amdgpu: Rework amdgpu_device_lock_adev
Christian König
ckoenig.leichtzumerken at gmail.com
Wed Feb 9 08:04:15 UTC 2022
Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
> This functions needs to be split into 2 parts where
> one is called only once for locking single instance of
> reset_domain's sem and reset flag and the other part
> which handles MP1 states should still be called for
> each device in XGMI hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 48 ++++++++++++++++------
> 1 file changed, 35 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e05d7cbefd2c..aaecf0797484 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4825,16 +4825,20 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> return r;
> }
>
> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
> - struct amdgpu_hive_info *hive)
> +static void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain,
> + struct amdgpu_hive_info *hive)
> {
> - atomic_set(&adev->reset_domain->in_gpu_reset, 1);
> + atomic_set(&reset_domain->in_gpu_reset, 1);
>
> if (hive) {
> - down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock);
> + down_write_nest_lock(&reset_domain->sem, &hive->hive_lock);
> } else {
> - down_write(&adev->reset_domain->sem);
> + down_write(&reset_domain->sem);
> }
> +}
> +
> +static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
> +{
>
> switch (amdgpu_asic_reset_method(adev)) {
> case AMD_RESET_METHOD_MODE1:
> @@ -4849,14 +4853,19 @@ static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
> }
> }
>
> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
> {
> amdgpu_vf_error_trans_all(adev);
> adev->mp1_state = PP_MP1_STATE_NONE;
> - atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> - up_write(&adev->reset_domain->sem);
> }
>
> +static void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
> +{
> + atomic_set(&reset_domain->in_gpu_reset, 0);
> + up_write(&reset_domain->sem);
> +}
I would move this into amdgpu_reset.c and call it
amdgpu_reset_domain_unlock().
Same of course for amdgpu_reset_domain_lock()...
Apart from that looks good to me,
Christian.
> +
> +
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> {
> struct pci_dev *p = NULL;
> @@ -5060,10 +5069,15 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
> device_list_handle = &device_list;
> }
>
> + /* We need to lock reset domain only once both for XGMI and single device */
> + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
> + reset_list);
> + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);
> +
> /* block all schedulers and reset given job's ring */
> list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>
> - amdgpu_device_lock_adev(tmp_adev, hive);
> + amdgpu_device_set_mp1_state(tmp_adev);
>
> /*
> * Try to put the audio codec into suspend state
> @@ -5213,9 +5227,14 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>
> if (audio_suspended)
> amdgpu_device_resume_display_audio(tmp_adev);
> - amdgpu_device_unlock_adev(tmp_adev);
> +
> + amdgpu_device_unset_mp1_state(tmp_adev);
> }
>
> + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
> + reset_list);
> + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
> +
> if (hive) {
> mutex_unlock(&hive->hive_lock);
> amdgpu_put_xgmi_hive(hive);
> @@ -5477,7 +5496,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
> * Locking adev->reset_domain->sem will prevent any external access
> * to GPU during PCI error recovery
> */
> - amdgpu_device_lock_adev(adev, NULL);
> + amdgpu_device_lock_reset_domain(adev->reset_domain, NULL);
> + amdgpu_device_set_mp1_state(adev);
>
> /*
> * Block any work scheduling as we do for regular GPU reset
> @@ -5584,7 +5604,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
> DRM_INFO("PCIe error recovery succeeded\n");
> } else {
> DRM_ERROR("PCIe error recovery failed, err:%d", r);
> - amdgpu_device_unlock_adev(adev);
> + amdgpu_device_unset_mp1_state(adev);
> + amdgpu_device_unlock_reset_domain(adev->reset_domain);
> }
>
> return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
> @@ -5621,7 +5642,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
> drm_sched_start(&ring->sched, true);
> }
>
> - amdgpu_device_unlock_adev(adev);
> + amdgpu_device_unset_mp1_state(adev);
> + amdgpu_device_unlock_reset_domain(adev->reset_domain);
> }
>
> bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
More information about the dri-devel
mailing list