[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

Grodzovsky, Andrey Andrey.Grodzovsky at amd.com
Mon Jan 7 20:33:20 UTC 2019


Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>

Andrey


On 01/07/2019 12:31 PM, StDenis, Tom wrote:
> v2: Move locks around in other functions so that this
> function can stand on its own.  Also hold only the
> hive-specific lock for add/remove device, instead of the
> driver-global lock, so that devices cannot be added to or
> removed from the same hive in parallel.
>
> v3: add reset_lock, a separate per-hive mutex used by GPU recovery
>
> Signed-off-by: Tom St Denis <tom.stdenis at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 41 ++++++++++++++--------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  5 +--
>   3 files changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 39d5d058b2c7..1a558dc41ba6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	 * by different nodes. No point also since the one node already executing
>   	 * reset will also reset all the other nodes in the hive.
>   	 */
> -	hive = amdgpu_get_xgmi_hive(adev);
> +	hive = amdgpu_get_xgmi_hive(adev, 0);
>   	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
> -	    !mutex_trylock(&hive->hive_lock))
> +	    !mutex_trylock(&hive->reset_lock))
>   		return 0;
>   
>   	/* Start with adev pre asic reset first for soft reset check.*/
> @@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	}
>   
>   	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
> -		mutex_unlock(&hive->hive_lock);
> +		mutex_unlock(&hive->reset_lock);
>   
>   	if (r)
>   		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 8a8bc60cb6b4..9e98ab8b1525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
>   	return &hive->device_list;
>   }
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
>   {
>   	int i;
>   	struct amdgpu_hive_info *tmp;
>   
>   	if (!adev->gmc.xgmi.hive_id)
>   		return NULL;
> +
> +	mutex_lock(&xgmi_mutex);
> +
>   	for (i = 0 ; i < hive_count; ++i) {
>   		tmp = &xgmi_hives[i];
> -		if (tmp->hive_id == adev->gmc.xgmi.hive_id)
> +		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
> +			if (lock)
> +				mutex_lock(&tmp->hive_lock);
> +			mutex_unlock(&xgmi_mutex);
>   			return tmp;
> +		}
>   	}
> -	if (i >= AMDGPU_MAX_XGMI_HIVE)
> +	if (i >= AMDGPU_MAX_XGMI_HIVE) {
> +		mutex_unlock(&xgmi_mutex);
>   		return NULL;
> +	}
>   
>   	/* initialize new hive if not exist */
>   	tmp = &xgmi_hives[hive_count++];
>   	tmp->hive_id = adev->gmc.xgmi.hive_id;
>   	INIT_LIST_HEAD(&tmp->device_list);
>   	mutex_init(&tmp->hive_lock);
> +	mutex_init(&tmp->reset_lock);
> +	if (lock)
> +		mutex_lock(&tmp->hive_lock);
> +
> +	mutex_unlock(&xgmi_mutex);
>   
>   	return tmp;
>   }
> @@ -111,8 +125,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   		return ret;
>   	}
>   
> -	mutex_lock(&xgmi_mutex);
> -	hive = amdgpu_get_xgmi_hive(adev);
> +	/* find hive and take lock */
> +	hive = amdgpu_get_xgmi_hive(adev, 1);
>   	if (!hive)
>   		goto exit;
>   
> @@ -142,8 +156,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   			break;
>   	}
>   
> +	mutex_unlock(&hive->hive_lock);
>   exit:
> -	mutex_unlock(&xgmi_mutex);
>   	return ret;
>   }
>   
> @@ -154,15 +168,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
>   	if (!adev->gmc.xgmi.supported)
>   		return;
>   
> -	mutex_lock(&xgmi_mutex);
> -
> -	hive = amdgpu_get_xgmi_hive(adev);
> +	hive = amdgpu_get_xgmi_hive(adev, 1);
>   	if (!hive)
> -		goto exit;
> +		return;
>   
> -	if (!(hive->number_devices--))
> +	if (!(hive->number_devices--)) {
>   		mutex_destroy(&hive->hive_lock);
> -
> -exit:
> -	mutex_unlock(&xgmi_mutex);
> +		mutex_destroy(&hive->reset_lock);
> +	} else {
> +		mutex_unlock(&hive->hive_lock);
> +	}
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 6151eb9c8ad3..14bc60664159 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -29,10 +29,11 @@ struct amdgpu_hive_info {
>   	struct list_head	device_list;
>   	struct psp_xgmi_topology_info	topology_info;
>   	int number_devices;
> -	struct mutex hive_lock;
> +	struct mutex hive_lock,
> +		     reset_lock;
>   };
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
>   int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>   void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);



More information about the amd-gfx mailing list