[RFC v3 10/12] drm/amdgpu: Move in_gpu_reset into reset_domain

Lazar, Lijo lijo.lazar at amd.com
Tue Feb 8 10:49:20 UTC 2022



On 1/26/2022 4:07 AM, Andrey Grodzovsky wrote:
> We should have a single instance per entrire reset domain.
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> Suggested-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 ++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  4 ++--
>   6 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f021cd3c9d34..087796e389ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1056,7 +1056,6 @@ struct amdgpu_device {
>   	bool				in_s4;
>   	bool				in_s0ix;
>   
> -	atomic_t 			in_gpu_reset;
>   	enum pp_mp1_state               mp1_state;
>   	struct amdgpu_doorbell_index doorbell_index;
>   
> @@ -1461,8 +1460,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
>          return adev->gmc.tmz_enabled;
>   }
>   
> -static inline int amdgpu_in_reset(struct amdgpu_device *adev)
> -{
> -	return atomic_read(&adev->in_gpu_reset);
> -}
> +int amdgpu_in_reset(struct amdgpu_device *adev);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6991ab4a8191..aa43af443ebe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3511,7 +3511,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->mn_lock);
>   	mutex_init(&adev->virt.vf_errors.lock);
>   	hash_init(adev->mn_hash);
> -	atomic_set(&adev->in_gpu_reset, 0);
>   	mutex_init(&adev->psp.mutex);
>   	mutex_init(&adev->notifier_lock);
>   
> @@ -4775,7 +4774,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
>   				struct amdgpu_hive_info *hive)
>   {
> -	atomic_set(&adev->in_gpu_reset, 1);
> +	atomic_set(&adev->reset_domain->in_gpu_reset, 1);
>   
>   	if (hive) {
>   		down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock);
> @@ -4800,7 +4799,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   {
>   	amdgpu_vf_error_trans_all(adev);
>   	adev->mp1_state = PP_MP1_STATE_NONE;
> -	atomic_set(&adev->in_gpu_reset, 0);
> +	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
>   	up_write(&adev->reset_domain->sem);
>   }
>   
> @@ -5643,3 +5642,8 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
>   
>   	amdgpu_asic_invalidate_hdp(adev, ring);
>   }
> +
> +int amdgpu_in_reset(struct amdgpu_device *adev)
> +{
> +	return atomic_read(&adev->reset_domain->in_gpu_reset);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 011585e330f6..e9b804a89b34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -127,6 +127,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(char *wq_name)
>   
>   	}
>   
> +	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	init_rwsem(&reset_domain->sem);
>   
>   	return reset_domain;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 7451089b0c06..413982f4e1ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -74,6 +74,7 @@ struct amdgpu_reset_domain {
>   	struct kref refcount;
>   	struct workqueue_struct *wq;
>   	struct rw_semaphore sem;
> +	atomic_t in_gpu_reset;

Maybe 'active' (independent of gpu) just to indicate that a reset is 
ongoing in the domain?

Thanks,
Lijo

>   };
>   
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 5dab06fce26a..6c79746d18db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -258,7 +258,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   		return;
>   
>   	amdgpu_virt_fini_data_exchange(adev);
> -	atomic_set(&adev->in_gpu_reset, 1);
> +	atomic_set(&adev->reset_domain->in_gpu_reset, 1);
>   
>   	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>   
> @@ -271,7 +271,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	} while (timeout > 1);
>   
>   flr_done:
> -	atomic_set(&adev->in_gpu_reset, 0);
> +	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
>   	up_write(&adev->reset_domain->sem);
>   
>   	/* Trigger recovery for world switch failure if no TDR */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 868144fff16a..39f7e1e9ab81 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -287,7 +287,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   		return;
>   
>   	amdgpu_virt_fini_data_exchange(adev);
> -	atomic_set(&adev->in_gpu_reset, 1);
> +	atomic_set(&adev->reset_domain->in_gpu_reset, 1);
>   
>   	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>   
> @@ -300,7 +300,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	} while (timeout > 1);
>   
>   flr_done:
> -	atomic_set(&adev->in_gpu_reset, 0);
> +	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
>   	up_write(&adev->reset_domain->sem);
>   
>   	/* Trigger recovery for world switch failure if no TDR */
> 


More information about the amd-gfx mailing list