[PATCH v2] drm/amdgpu: Fix the logic for NPS request failure

Lazar, Lijo lijo.lazar at amd.com
Tue Oct 22 13:25:22 UTC 2024


<ping>

On 10/18/2024 12:05 PM, Lijo Lazar wrote:
> On a hive, the NPS request is placed by the first device on behalf of all
> devices in the hive. If the request fails, mark the mode as UNKNOWN so that
> subsequent devices don't request it again on unload. Also, fix the mutex
> double-lock issue in the error path; it should have been mutex_unlock.
> 
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> Reviewed-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> 
> Fixes: 44d5206ec07c ("drm/amdgpu: Place NPS mode request on unload")
> ---
> v2: Add a log message for debugging purposes (Rajneesh)
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 ++++++++++++++----------
>  1 file changed, 16 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index fcdbcff57632..3ef5066ca529 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -1586,26 +1586,30 @@ int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
>  	 * devices don't request anymore.
>  	 */
>  	mutex_lock(&hive->hive_lock);
> +	if (atomic_read(&hive->requested_nps_mode) ==
> +	    UNKNOWN_MEMORY_PARTITION_MODE) {
> +		dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
> +		mutex_unlock(&hive->hive_lock);
> +		return 0;
> +	}
>  	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>  		r = adev->gmc.gmc_funcs->request_mem_partition_mode(
>  			tmp_adev, req_nps_mode);
>  		if (r)
> -			goto err;
> +			break;
> +	}
> +	if (r) {
> +		/* Request back current mode if one of the requests failed */
> +		cur_nps_mode =
> +			adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
> +		list_for_each_entry_continue_reverse(
> +			tmp_adev, &hive->device_list, gmc.xgmi.head)
> +			adev->gmc.gmc_funcs->request_mem_partition_mode(
> +				tmp_adev, cur_nps_mode);
>  	}
>  	/* Set to UNKNOWN so that other devices don't request anymore */
>  	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
> -
>  	mutex_unlock(&hive->hive_lock);
>  
> -	return 0;
> -err:
> -	/* Request back current mode if one of the requests failed */
> -	cur_nps_mode = adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
> -	list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list,
> -					     gmc.xgmi.head)
> -		adev->gmc.gmc_funcs->request_mem_partition_mode(tmp_adev,
> -								cur_nps_mode);
> -	mutex_lock(&hive->hive_lock);
> -
>  	return r;
>  }


More information about the amd-gfx mailing list