[PATCH] drm/amdgpu: add mutex to protect ras shared memory

Lazar, Lijo lijo.lazar at amd.com
Mon Apr 29 07:30:38 UTC 2024



On 4/28/2024 12:38 PM, YiPeng Chai wrote:
> Add mutex to protect ras shared memory.
> 
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 121 ++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
>  3 files changed, 84 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 5583e2d1b12f..fa4fea00f6b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
>  	}
>  }
>  
> +static int psp_ras_send_cmd(struct psp_context *psp,
> +		enum ras_command cmd_id, void *in, void *out)
> +{
> +	struct ta_ras_shared_memory *ras_cmd;
> +	uint32_t cmd = cmd_id;
> +	int ret = 0;
> +
> +	mutex_lock(&psp->ras_context.mutex);
> +	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> +	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> +
> +	switch (cmd) {
> +	case TA_RAS_COMMAND__ENABLE_FEATURES:
> +	case TA_RAS_COMMAND__DISABLE_FEATURES:
> +		memcpy(&ras_cmd->ras_in_message,
> +			in, sizeof(ras_cmd->ras_in_message));
> +		break;
> +	case TA_RAS_COMMAND__TRIGGER_ERROR:
> +		memcpy(&ras_cmd->ras_in_message.trigger_error,
> +			in, sizeof(ras_cmd->ras_in_message.trigger_error));
> +		break;
> +	case TA_RAS_COMMAND__QUERY_ADDRESS:
> +		memcpy(&ras_cmd->ras_in_message.address,
> +			in, sizeof(ras_cmd->ras_in_message.address));
> +		break;
> +	default:
> +		dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
> +		ret = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	ras_cmd->cmd_id = cmd;
> +	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +
> +	switch (cmd) {
> +	case TA_RAS_COMMAND__TRIGGER_ERROR:
> +		if (out) {

As NULL check for 'out' is done before copying, better to do the same
for 'in' also.
> +			uint32_t *ras_status = (uint32_t *)out;
> +
> +			*ras_status = ras_cmd->ras_status;
> +		}
> +		break;
> +	case TA_RAS_COMMAND__QUERY_ADDRESS:
> +		if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> +			ret = -EINVAL;
> +		else if (out)
> +			memcpy(out,
> +				&ras_cmd->ras_out_message.address,
> +				sizeof(ras_cmd->ras_out_message.address));
> +		break;
> +	default:
> +		break;
> +	}
> +
> +err_out:
> +	mutex_unlock(&psp->ras_context.mutex);
> +
> +	return ret;
> +}
> +
>  int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
>  {
>  	struct ta_ras_shared_memory *ras_cmd;
> @@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
>  int psp_ras_enable_features(struct psp_context *psp,
>  		union ta_ras_cmd_input *info, bool enable)
>  {
> -	struct ta_ras_shared_memory *ras_cmd;
> +	enum ras_command cmd_id;
>  	int ret;
>  
> -	if (!psp->ras_context.context.initialized)
> +	if (!psp->ras_context.context.initialized || !info)
>  		return -EINVAL;
>  
> -	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> -	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -	if (enable)
> -		ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
> -	else
> -		ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
> -
> -	ras_cmd->ras_in_message = *info;
> -
> -	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +	cmd_id = enable ?
> +		TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
> +	ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
>  	if (ret)
>  		return -EINVAL;
>  
> @@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)
>  
>  	psp->ras_context.context.initialized = false;
>  
> +	mutex_destroy(&psp->ras_context.mutex);
> +
>  	return ret;
>  }
>  
> @@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)
>  
>  	ret = psp_ta_load(psp, &psp->ras_context.context);
>  
> -	if (!ret && !ras_cmd->ras_status)
> +	if (!ret && !ras_cmd->ras_status) {
>  		psp->ras_context.context.initialized = true;
> -	else {
> +		mutex_init(&psp->ras_context.mutex);
> +	} else {
>  		if (ras_cmd->ras_status)
>  			dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);
>  
> @@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)
>  int psp_ras_trigger_error(struct psp_context *psp,
>  			  struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
>  {
> -	struct ta_ras_shared_memory *ras_cmd;
>  	struct amdgpu_device *adev = psp->adev;
>  	int ret;
>  	uint32_t dev_mask;
> +	uint32_t ras_status;
>  
> -	if (!psp->ras_context.context.initialized)
> +	if (!psp->ras_context.context.initialized || !info)
>  		return -EINVAL;
>  
>  	switch (info->block_id) {
> @@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
>  	dev_mask &= AMDGPU_RAS_INST_MASK;
>  	info->sub_block_index |= dev_mask;
>  
> -	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> -	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -	ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
> -	ras_cmd->ras_in_message.trigger_error = *info;
> -
> -	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +	ret = psp_ras_send_cmd(psp,
> +			TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
>  	if (ret)
>  		return -EINVAL;
>  
> @@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
>  	if (amdgpu_ras_intr_triggered())
>  		return 0;
>  
> -	if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
> +	if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
>  		return -EACCES;
> -	else if (ras_cmd->ras_status)
> +	else if (ras_status)
>  		return -EINVAL;
>  
>  	return 0;
> @@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
>  			  struct ta_ras_query_address_input *addr_in,
>  			  struct ta_ras_query_address_output *addr_out)
>  {
> -	struct ta_ras_shared_memory *ras_cmd;
>  	int ret;
>  
> -	if (!psp->ras_context.context.initialized)
> -		return -EINVAL;
> -
> -	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> -	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -	ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
> -	ras_cmd->ras_in_message.address = *addr_in;
> -
> -	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> -	if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> +	if (!psp->ras_context.context.initialized ||
> +		!addr_in || !addr_out)
>  		return -EINVAL;
>  
> -	*addr_out = ras_cmd->ras_out_message.address;
> +	ret = psp_ras_send_cmd(psp,
> +			TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);

return psp_ras_send_cmd() will do.
>  
> -	return 0;
> +	return ret;
>  }
>  // ras end
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index ee16f134ae92..686023918ce3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -197,6 +197,7 @@ struct psp_xgmi_context {
>  struct psp_ras_context {
>  	struct ta_context		context;
>  	struct amdgpu_ras		*ras;
> +	struct mutex			mutex;
>  };
>  
>  #define MEM_TRAIN_SYSTEM_SIGNATURE		0x54534942
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> index ca5c86e5f7cd..87f213f92d83 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> @@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
>  
>  	context->session_id = ta_id;
>  
> +	mutex_lock(&psp->ras_context.mutex);
>  	ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
>  	if (ret)
>  		goto err_free_shared_buf;
> @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
>  		ret = -EFAULT;
>  
>  err_free_shared_buf:

This error label is used at other places as well and that happens even
before acquiring the mutex.

Thanks,
Lijo
> +	mutex_unlock(&psp->ras_context.mutex);
>  	kfree(shared_buf);
>  
>  	return ret;


More information about the amd-gfx mailing list